Commit e5ec7bb

[flang][cuda] Set correct offsets for multiple variables in dynamic shared memory (#131674)
1 parent ad8f0e2 commit e5ec7bb
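
Before this change, the pass gave every dynamically sized shared-memory variable an offset of 0, so when a kernel declared several such variables they all started at the same address in the dynamic shared memory block. The pass now carries a running offset: each variable is placed at the accumulated byte size of the dynamic variables before it, computed as the element type's size multiplied by each runtime extent.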

2 files changed: +75 -6 lines changed

flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp (22 additions & 6 deletions)
@@ -58,11 +58,13 @@ struct CUFComputeSharedMemoryOffsetsAndSize
     auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab);
     mlir::Type i8Ty = builder.getI8Type();
     mlir::Type i32Ty = builder.getI32Type();
+    mlir::Type idxTy = builder.getIndexType();
     for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
       unsigned nbDynamicSharedVariables = 0;
       unsigned nbStaticSharedVariables = 0;
       uint64_t sharedMemSize = 0;
       unsigned short alignment = 0;
+      mlir::Value crtDynOffset;

       // Go over each shared memory operation and compute their start offset and
       // the size and alignment of the global to be generated if all variables
@@ -73,16 +75,30 @@ struct CUFComputeSharedMemoryOffsetsAndSize
         builder.setInsertionPoint(sharedOp);
         if (fir::hasDynamicSize(sharedOp.getInType())) {
           mlir::Type ty = sharedOp.getInType();
-          // getTypeSizeAndAlignmentOrCrash will crash trying to compute the
-          // size of an array with dynamic size. Just get the alignment to
-          // create the global.
           if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
             ty = seqTy.getEleTy();
           unsigned short align = dl->getTypeABIAlignment(ty);
-          ++nbDynamicSharedVariables;
-          mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
-          sharedOp.getOffsetMutable().assign(zero);
           alignment = std::max(alignment, align);
+          uint64_t tySize = dl->getTypeSize(ty);
+          ++nbDynamicSharedVariables;
+          if (crtDynOffset) {
+            sharedOp.getOffsetMutable().assign(
+                builder.createConvert(loc, i32Ty, crtDynOffset));
+          } else {
+            mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
+            sharedOp.getOffsetMutable().assign(zero);
+          }
+
+          mlir::Value dynSize =
+              builder.createIntegerConstant(loc, idxTy, tySize);
+          for (auto extent : sharedOp.getShape())
+            dynSize = builder.create<mlir::arith::MulIOp>(loc, dynSize, extent);
+          if (crtDynOffset)
+            crtDynOffset =
+                builder.create<mlir::arith::AddIOp>(loc, crtDynOffset, dynSize);
+          else
+            crtDynOffset = dynSize;
+
           continue;
         }
         auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
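
To make the bookkeeping concrete, here is a minimal standalone sketch of the same arithmetic in plain C++ (not the pass itself: the struct, the constant extents, and the names are illustrative stand-ins for the cuf.shared_memory ops whose sizes the pass computes with arith ops at runtime):

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-in for a dynamically sized shared variable; the pass
// works on cuf.shared_memory ops, here plain numbers model the same math.
struct DynSharedVar {
  std::uint64_t elemSizeBytes;        // dl->getTypeSize(eleTy) in the pass
  std::vector<std::uint64_t> extents; // sharedOp.getShape(), made constant
};

int main() {
  // Two i32 arrays like s1 and s2 in the test, with made-up extents.
  std::vector<DynSharedVar> vars = {{4, {8, 8}}, {4, {16, 8}}};

  std::uint64_t crtDynOffset = 0; // mirrors the pass's crtDynOffset value
  for (const DynSharedVar &v : vars) {
    // The first variable keeps offset 0; each later one starts where the
    // previous ones end.
    std::cout << "offset = " << crtDynOffset << " bytes\n";
    std::uint64_t dynSize = v.elemSizeBytes;
    for (std::uint64_t extent : v.extents) // size = elemSize * prod(extents)
      dynSize *= extent;
    crtDynOffset += dynSize;
  }
  // Prints: offset = 0 bytes, then offset = 256 bytes (4 * 8 * 8).
}

With these numbers the first array occupies bytes [0, 256) and the second starts at offset 256, which is exactly the running value the pass converts to i32 and assigns to the second op's offset operand.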

flang/test/Fir/CUDA/cuda-shared-offset.mlir (53 additions & 0 deletions)
@@ -54,3 +54,56 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
 // CHECK: fir.global internal @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
 // CHECK: }
 // CHECK: }
+
+// -----
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
+  gpu.module @cuda_device_mod {
+    gpu.func @_QMmPshareddyn(%arg0: !fir.box<!fir.array<?x?xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}, %arg1: !fir.box<!fir.array<?x?xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "b"}, %arg2: i32 {fir.bindc_name = "k"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+      %c1_i32 = arith.constant 1 : i32
+      %c2_i32 = arith.constant 2 : i32
+      %c0 = arith.constant 0 : index
+      %5 = fir.address_of(@_QM__fortran_builtinsE__builtin_blockdim) : !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+      %6 = fir.declare %5 {uniq_name = "_QM__fortran_builtinsE__builtin_blockdim"} : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>
+      %15 = fir.alloca i32
+      %16 = fir.declare %15 {fortran_attrs = #fir.var_attrs<value>, uniq_name = "_QMmFss1Ek"} : (!fir.ref<i32>) -> !fir.ref<i32>
+      %27 = fir.coordinate_of %6, x : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+      %28 = fir.load %27 : !fir.ref<i32>
+      %29 = fir.convert %28 : (i32) -> i64
+      %30 = fir.convert %29 : (i64) -> index
+      %31 = arith.cmpi sgt, %30, %c0 : index
+      %32 = arith.select %31, %30, %c0 : index
+      %33 = fir.coordinate_of %6, y : (!fir.ref<!fir.type<_QM__fortran_builtinsT__builtin_dim3{x:i32,y:i32,z:i32}>>) -> !fir.ref<i32>
+      %34 = fir.load %33 : !fir.ref<i32>
+      %35 = fir.convert %34 : (i32) -> i64
+      %36 = fir.convert %35 : (i64) -> index
+      %37 = arith.cmpi sgt, %36, %c0 : index
+      %38 = arith.select %37, %36, %c0 : index
+      %39 = cuf.shared_memory !fir.array<?x?xi32>, %32, %38 : index, index {bindc_name = "s1", uniq_name = "_QMmFss1Es1"} -> !fir.ref<!fir.array<?x?xi32>>
+      %40 = fir.shape %32, %38 : (index, index) -> !fir.shape<2>
+      %41 = fir.declare %39(%40) {data_attr = #cuf.cuda<shared>, uniq_name = "_QMmFss1Es1"} : (!fir.ref<!fir.array<?x?xi32>>, !fir.shape<2>) -> !fir.ref<!fir.array<?x?xi32>>
+      %42 = fir.load %16 : !fir.ref<i32>
+      %43 = arith.muli %42, %c2_i32 : i32
+      %44 = fir.convert %43 : (i32) -> i64
+      %45 = fir.convert %44 : (i64) -> index
+      %46 = arith.cmpi sgt, %45, %c0 : index
+      %47 = arith.select %46, %45, %c0 : index
+      %48 = fir.load %16 : !fir.ref<i32>
+      %49 = fir.convert %48 : (i32) -> i64
+      %50 = fir.convert %49 : (i64) -> index
+      %51 = arith.cmpi sgt, %50, %c0 : index
+      %52 = arith.select %51, %50, %c0 : index
+      %53 = cuf.shared_memory !fir.array<?x?xi32>, %47, %52 : index, index {bindc_name = "s2", uniq_name = "_QMmFss1Es2"} -> !fir.ref<!fir.array<?x?xi32>>
+      gpu.return
+    }
+  }
+}
+
+// CHECK: gpu.func @_QMmPshareddyn(%arg0: !fir.box<!fir.array<?x?xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "a"}, %arg1: !fir.box<!fir.array<?x?xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "b"}, %arg2: i32 {fir.bindc_name = "k"}) attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
+// CHECK: %[[EXTENT0:.*]] = arith.select
+// CHECK: %[[EXTENT1:.*]] = arith.select
+// CHECK: %[[SIZE_EXTENT:.*]] = arith.muli %c4{{.*}}, %[[EXTENT0]] : index
+// CHECK: %[[DYNSIZE:.*]] = arith.muli %[[SIZE_EXTENT]], %[[EXTENT1]] : index
+// CHECK: cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?x?xi32>, %9, %15 : index, index {bindc_name = "s1", uniq_name = "_QMmFss1Es1"} -> !fir.ref<!fir.array<?x?xi32>>
+// CHECK: %[[CONV_DYNSIZE:.*]] = fir.convert %[[DYNSIZE]] : (index) -> i32
+// CHECK: cuf.shared_memory[%[[CONV_DYNSIZE]] : i32] !fir.array<?x?xi32>, %26, %31 : index, index {bindc_name = "s2", uniq_name = "_QMmFss1Es2"} -> !fir.ref<!fir.array<?x?xi32>>
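
The new CHECK lines pin down the fix: s1 keeps the constant offset 0, while s2's offset is the byte size of s1, built as 4 (the i32 element size) multiplied by the two selected extents via the arith.muli chain, then converted from index to i32 by fir.convert.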
