Skip to content

Commit 74d4fc0

Browse files
authored
[flang][cuda][NFC] Use ssa value for offset in shared memory op (#131661)
Switch the offset from an attribute to an SSA value, since we need to support a dynamic offset when multiple variables are used with dynamic shared memory.
1 parent 0191307 commit 74d4fc0

File tree

6 files changed

+26
-19
lines changed

6 files changed

+26
-19
lines changed

flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -360,14 +360,14 @@ def cuf_SharedMemoryOp
360360
let arguments = (ins TypeAttr:$in_type, OptionalAttr<StrAttr>:$uniq_name,
361361
OptionalAttr<StrAttr>:$bindc_name, Variadic<AnyIntegerType>:$typeparams,
362362
Variadic<AnyIntegerType>:$shape,
363-
OptionalAttr<I32Attr>:$offset // offset in bytes from the shared memory
364-
// base address.
363+
Optional<AnyIntegerType>:$offset // offset in bytes from the shared memory
364+
// base address.
365365
);
366366

367367
let results = (outs fir_ReferenceType:$ptr);
368368

369369
let assemblyFormat = [{
370-
$in_type (`(` $typeparams^ `:` type($typeparams) `)`)?
370+
(`[` $offset^ `:` type($offset) `]`)? $in_type (`(` $typeparams^ `:` type($typeparams) `)`)?
371371
(`,` $shape^ `:` type($shape) )? attr-dict `->` qualified(type($ptr))
372372
}];
373373

flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,7 @@ void cuf::SharedMemoryOp::build(
315315
bindcName.empty() ? mlir::StringAttr{} : builder.getStringAttr(bindcName);
316316
build(builder, result, wrapAllocaResultType(inType),
317317
mlir::TypeAttr::get(inType), nameAttr, bindcAttr, typeparams, shape,
318-
mlir::IntegerAttr{});
318+
/*offset=*/mlir::Value{});
319319
result.addAttributes(attributes);
320320
}
321321

flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ struct CUFComputeSharedMemoryOffsetsAndSize
5757

5858
auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab);
5959
mlir::Type i8Ty = builder.getI8Type();
60+
mlir::Type i32Ty = builder.getI32Type();
6061
for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
6162
unsigned nbDynamicSharedVariables = 0;
6263
unsigned nbStaticSharedVariables = 0;
@@ -68,6 +69,8 @@ struct CUFComputeSharedMemoryOffsetsAndSize
6869
// are static. If this is dynamic shared memory, then only the alignment
6970
// is computed.
7071
for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) {
72+
mlir::Location loc = sharedOp.getLoc();
73+
builder.setInsertionPoint(sharedOp);
7174
if (fir::hasDynamicSize(sharedOp.getInType())) {
7275
mlir::Type ty = sharedOp.getInType();
7376
// getTypeSizeAndAlignmentOrCrash will crash trying to compute the
@@ -77,14 +80,17 @@ struct CUFComputeSharedMemoryOffsetsAndSize
7780
ty = seqTy.getEleTy();
7881
unsigned short align = dl->getTypeABIAlignment(ty);
7982
++nbDynamicSharedVariables;
80-
sharedOp.setOffset(0);
83+
mlir::Value zero = builder.createIntegerConstant(loc, i32Ty, 0);
84+
sharedOp.getOffsetMutable().assign(zero);
8185
alignment = std::max(alignment, align);
8286
continue;
8387
}
8488
auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
8589
sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap);
8690
++nbStaticSharedVariables;
87-
sharedOp.setOffset(llvm::alignTo(sharedMemSize, align));
91+
mlir::Value offset = builder.createIntegerConstant(
92+
loc, i32Ty, llvm::alignTo(sharedMemSize, align));
93+
sharedOp.getOffsetMutable().assign(offset);
8894
sharedMemSize =
8995
llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
9096
alignment = std::max(alignment, align);

flang/lib/Optimizer/Transforms/CUFGPUToLLVMConversion.cpp

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -232,8 +232,7 @@ struct CUFSharedMemoryOpConversion
232232
loc, mlir::LLVM::LLVMPointerType::get(rewriter.getContext()),
233233
sharedGlobalAddr);
234234
mlir::Type baseType = castPtr->getResultTypes().front();
235-
llvm::SmallVector<mlir::LLVM::GEPArg> gepArgs = {
236-
static_cast<int32_t>(*op.getOffset())};
235+
llvm::SmallVector<mlir::LLVM::GEPArg> gepArgs = {op.getOffset()};
237236
mlir::Value shmemPtr = rewriter.create<mlir::LLVM::GEPOp>(
238237
loc, baseType, rewriter.getI8Type(), castPtr, gepArgs);
239238
rewriter.replaceOp(op, {shmemPtr});

flang/test/Fir/CUDA/cuda-shared-offset.mlir

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
1414

1515
// CHECK-LABEL: gpu.module @cuda_device_mod
1616
// CHECK: gpu.func @_QPdynshared()
17-
// CHECK: %{{.*}} = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", offset = 0 : i32, uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
17+
// CHECK: %{{.*}} = cuf.shared_memory[%c0{{.*}} : i32] !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
1818
// CHECK: gpu.return
1919
// CHECK: }
2020
// CHECK: fir.global internal @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
@@ -43,12 +43,12 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
4343

4444
// CHECK-LABEL: gpu.module @cuda_device_mod
4545
// CHECK: gpu.func @_QPshared_static()
46-
// CHECK: cuf.shared_memory i32 {bindc_name = "a", offset = 0 : i32, uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
47-
// CHECK: cuf.shared_memory i32 {bindc_name = "b", offset = 4 : i32, uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
48-
// CHECK: cuf.shared_memory i32 {bindc_name = "c", offset = 8 : i32, uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
49-
// CHECK: cuf.shared_memory i32 {bindc_name = "d", offset = 12 : i32, uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
50-
// CHECK: cuf.shared_memory i64 {bindc_name = "e", offset = 16 : i32, uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
51-
// CHECK: cuf.shared_memory f32 {bindc_name = "r", offset = 24 : i32, uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
46+
// CHECK: cuf.shared_memory[%c0{{.*}} : i32] i32 {bindc_name = "a", uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
47+
// CHECK: cuf.shared_memory[%c4{{.*}} : i32] i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
48+
// CHECK: cuf.shared_memory[%c8{{.*}} : i32] i32 {bindc_name = "c", uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
49+
// CHECK: cuf.shared_memory[%c12{{.*}} : i32] i32 {bindc_name = "d", uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
50+
// CHECK: cuf.shared_memory[%c16{{.*}} : i32] i64 {bindc_name = "e", uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
51+
// CHECK: cuf.shared_memory[%c24{{.*}} : i32] f32 {bindc_name = "r", uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
5252
// CHECK: gpu.return
5353
// CHECK: }
5454
// CHECK: fir.global internal @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>

flang/test/Fir/CUDA/cuda-shared-to-llvm.mlir

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
44
gpu.module @cuda_device_mod {
55
llvm.func @_QPshared_static() {
6-
%0 = cuf.shared_memory i32 {bindc_name = "a", offset = 0 : i32, uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
7-
%1 = cuf.shared_memory i32 {bindc_name = "b", offset = 4 : i32, uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
6+
%c0 = arith.constant 0 : i32
7+
%0 = cuf.shared_memory [%c0 : i32] i32 {bindc_name = "a", uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
8+
%c4 = arith.constant 4 : i32
9+
%1 = cuf.shared_memory [%c4 : i32] i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
810
llvm.return
911
}
1012
llvm.mlir.global common @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {addr_space = 3 : i32, alignment = 8 : i64} : !llvm.array<28 x i8>
@@ -14,7 +16,7 @@ module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<
1416
// CHECK-LABEL: llvm.func @_QPshared_static()
1517
// CHECK: %[[ADDR0:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem : !llvm.ptr<3>
1618
// CHECK: %[[ADDRCAST0:.*]] = llvm.addrspacecast %[[ADDR0]] : !llvm.ptr<3> to !llvm.ptr
17-
// CHECK: %[[A:.*]] = llvm.getelementptr %[[ADDRCAST0]][0] : (!llvm.ptr) -> !llvm.ptr, i8
19+
// CHECK: %[[A:.*]] = llvm.getelementptr %[[ADDRCAST0]][%c0{{.*}}] : (!llvm.ptr, i32) -> !llvm.ptr, i8
1820
// CHECK: %[[ADDR1:.*]] = llvm.mlir.addressof @_QPshared_static__shared_mem : !llvm.ptr<3>
1921
// CHECK: %[[ADDRCAST1:.*]] = llvm.addrspacecast %[[ADDR1]] : !llvm.ptr<3> to !llvm.ptr
20-
// CHECK: %[[B:.*]] = llvm.getelementptr %[[ADDRCAST1]][4] : (!llvm.ptr) -> !llvm.ptr, i8
22+
// CHECK: %[[B:.*]] = llvm.getelementptr %[[ADDRCAST1]][%c4{{.*}}] : (!llvm.ptr, i32) -> !llvm.ptr, i8

0 commit comments

Comments
 (0)