Skip to content

Commit a76609d

Browse files
authored
[flang][cuda] Avoid intrinsics simplification in device context (llvm#117026)
1 parent f881a38 commit a76609d

File tree

4 files changed

+71
-0
lines changed

4 files changed

+71
-0
lines changed

flang/include/flang/Optimizer/Transforms/CUFCommon.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ namespace cuf {
2020
mlir::gpu::GPUModuleOp getOrCreateGPUModule(mlir::ModuleOp mod,
2121
mlir::SymbolTable &symTab);
2222

23+
/// Return true when \p op is nested inside a cuf::KernelOp, a
/// mlir::gpu::GPUFuncOp, or a mlir::func::FuncOp whose CUDA procedure
/// attribute (cuf::ProcAttributeAttr) is anything other than
/// cuf::ProcAttribute::Host. Return false otherwise, including when \p op is
/// null.
bool isInCUDADeviceContext(mlir::Operation *op);
24+
2325
} // namespace cuf
2426

2527
#endif // FORTRAN_OPTIMIZER_TRANSFORMS_CUFCOMMON_H_

flang/lib/Optimizer/Transforms/CUFCommon.cpp

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "flang/Optimizer/Transforms/CUFCommon.h"
10+
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
11+
#include "mlir/Dialect/Func/IR/FuncOps.h"
1012
#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
1113

1214
/// Retrieve or create the CUDA Fortran GPU module in the given \p mod.
@@ -26,3 +28,18 @@ mlir::gpu::GPUModuleOp cuf::getOrCreateGPUModule(mlir::ModuleOp mod,
2628
symTab.insert(gpuMod, insertPt);
2729
return gpuMod;
2830
}
31+
32+
/// Tell whether \p op sits in code that is meant to run on a CUDA device.
///
/// The answer is true when \p op has an enclosing cuf::KernelOp or
/// mlir::gpu::GPUFuncOp, or when its enclosing mlir::func::FuncOp carries a
/// cuf::ProcAttributeAttr whose value is not cuf::ProcAttribute::Host.
/// A null \p op, or an operation matching none of the above, yields false.
bool cuf::isInCUDADeviceContext(mlir::Operation *op) {
  if (op == nullptr)
    return false;

  // Operations nested in a CUF kernel or a GPU function are device code.
  if (op->getParentOfType<cuf::KernelOp>())
    return true;
  if (op->getParentOfType<mlir::gpu::GPUFuncOp>())
    return true;

  // Otherwise the decision rests on the enclosing func.func, if there is one.
  auto enclosingFunc = op->getParentOfType<mlir::func::FuncOp>();
  if (!enclosingFunc)
    return false;

  // A function without a CUDA procedure attribute is not a device context.
  auto procAttr = enclosingFunc->getAttrOfType<cuf::ProcAttributeAttr>(
      cuf::getProcAttrName());
  if (!procAttr)
    return false;

  // Every procedure attribute value except Host counts as device context.
  return procAttr.getValue() != cuf::ProcAttribute::Host;
}

flang/lib/Optimizer/Transforms/SimplifyIntrinsics.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "flang/Optimizer/Dialect/FIRType.h"
3232
#include "flang/Optimizer/Dialect/Support/FIRContext.h"
3333
#include "flang/Optimizer/HLFIR/HLFIRDialect.h"
34+
#include "flang/Optimizer/Transforms/CUFCommon.h"
3435
#include "flang/Optimizer/Transforms/Passes.h"
3536
#include "flang/Optimizer/Transforms/Utils.h"
3637
#include "flang/Runtime/entry-names.h"
@@ -1276,6 +1277,8 @@ void SimplifyIntrinsicsPass::runOnOperation() {
12761277
fir::KindMapping kindMap = fir::getKindMapping(module);
12771278
module.walk([&](mlir::Operation *op) {
12781279
if (auto call = mlir::dyn_cast<fir::CallOp>(op)) {
1280+
if (cuf::isInCUDADeviceContext(op))
1281+
return;
12791282
if (mlir::SymbolRefAttr callee = call.getCalleeAttr()) {
12801283
mlir::StringRef funcName = callee.getLeafReference().getValue();
12811284
// Replace call to runtime function for SUM when it has single
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
// RUN: fir-opt --simplify-intrinsics %s | FileCheck %s
2+
3+
func.func @_QPsum_in_device(%arg0: !fir.ref<!fir.array<?xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}, %arg1: i32 {fir.bindc_name = "n"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
4+
%c5_i32 = arith.constant 5 : i32
5+
%c1 = arith.constant 1 : index
6+
%c0 = arith.constant 0 : index
7+
%c-1 = arith.constant -1 : index
8+
%0 = fir.dummy_scope : !fir.dscope
9+
%1 = fir.shape %c-1 : (index) -> !fir.shape<1>
10+
%2 = fir.declare %arg0(%1) dummy_scope %0 {data_attr = #cuf.cuda<device>, uniq_name = "_QFsum_in_deviceEa"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.dscope) -> !fir.ref<!fir.array<?xi32>>
11+
%3 = fir.embox %2(%1) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
12+
%4 = fir.alloca i32
13+
fir.store %arg1 to %4 : !fir.ref<i32>
14+
%5 = fir.declare %4 dummy_scope %0 {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QFsum_in_deviceEn"} : (!fir.ref<i32>, !fir.dscope) -> !fir.ref<i32>
15+
%12 = fir.alloca i32 {bindc_name = "i", uniq_name = "_QFsum_in_deviceEi"}
16+
%13 = fir.declare %12 {uniq_name = "_QFsum_in_deviceEi"} : (!fir.ref<i32>) -> !fir.ref<i32>
17+
%14 = fir.address_of(@_QM__fortran_builtinsE__builtin_threadidx) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
18+
%18 = fir.load %5 : !fir.ref<i32>
19+
%19 = fir.convert %18 : (i32) -> index
20+
%20 = arith.cmpi sgt, %19, %c0 : index
21+
%21 = arith.select %20, %19, %c0 : index
22+
%22 = fir.alloca !fir.array<?xi32>, %21 {bindc_name = "auto", uniq_name = "_QFsum_in_deviceEauto"}
23+
%23 = fir.shape %21 : (index) -> !fir.shape<1>
24+
%24 = fir.declare %22(%23) {uniq_name = "_QFsum_in_deviceEauto"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xi32>>
25+
%25 = fir.embox %24(%23) : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> !fir.box<!fir.array<?xi32>>
26+
%26 = fir.undefined index
27+
%27 = fir.slice %c1, %19, %c1 : (index, index, index) -> !fir.slice<1>
28+
%28 = fir.embox %24(%23) [%27] : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, !fir.slice<1>) -> !fir.box<!fir.array<?xi32>>
29+
%29 = fir.absent !fir.box<i1>
30+
%30 = fir.address_of(@_QQclX91d13f6e74caa2f03965d7a7c6a8fdd5) : !fir.ref<!fir.char<1,50>>
31+
%31 = fir.convert %28 : (!fir.box<!fir.array<?xi32>>) -> !fir.box<none>
32+
%32 = fir.convert %30 : (!fir.ref<!fir.char<1,50>>) -> !fir.ref<i8>
33+
%33 = fir.convert %c0 : (index) -> i32
34+
%34 = fir.convert %29 : (!fir.box<i1>) -> !fir.box<none>
35+
%35 = fir.call @_FortranASumInteger4(%31, %32, %c5_i32, %33, %34) fastmath<contract> : (!fir.box<none>, !fir.ref<i8>, i32, i32, !fir.box<none>) -> i32
36+
%36 = fir.load %13 : !fir.ref<i32>
37+
%37 = fir.convert %36 : (i32) -> i64
38+
%38 = fir.array_coor %2(%1) %37 : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>, i64) -> !fir.ref<i32>
39+
fir.store %35 to %38 : !fir.ref<i32>
40+
return
41+
}
42+
43+
// Check that intrinsic simplification is disabled in CUDA Fortran context. The simplified intrinsic is
44+
// created in the module op but the device func will be migrated into a gpu module op resulting in a
45+
// missing symbol error.
46+
// The simplified intrinsic could also be migrated to the gpu module but the choice has not been made
47+
// at this point.
48+
// CHECK-LABEL: func.func @_QPsum_in_device
49+
// CHECK-NOT: fir.call @_FortranASumInteger4x1_contract_simplified

0 commit comments

Comments
 (0)