Commit 4fb20b8

[flang][cuda] Compute offset on cuf.shared_memory ops (#131395)
Add a pass to compute the size of the shared memory (static shared memory) and the offset of each variable to be placed in shared memory. The global representing the shared memory is also created during this pass. In the case of dynamic shared memory, the global has the type `!fir.array<0xi8>` and the size of the memory is set at kernel launch.
1 parent 4b1b629 commit 4fb20b8
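
As a standalone illustration (not code from this commit), the sketch below applies the same packing rule to the variables used in the new static shared memory test: each variable starts at the running size rounded up to its alignment, the running size advances by the aligned size, and the global's alignment is the maximum member alignment. The local alignTo helper and the hard-coded sizes and alignments are assumptions for the example; the pass itself uses llvm::alignTo and queries sizes and alignments from the MLIR data layout.

// Minimal sketch of the offset/size computation described above.
// Sizes and alignments are hard-coded here; the pass derives them
// from the module's data layout.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

// Round value up to the next multiple of align (align is a power of two),
// mirroring llvm::alignTo.
static uint64_t alignTo(uint64_t value, uint64_t align) {
  return (value + align - 1) & ~(align - 1);
}

int main() {
  struct Var { const char *name; uint64_t size, align; };
  // Same shape as the static test: four i32, one i64, one f32.
  std::vector<Var> vars = {{"a", 4, 4}, {"b", 4, 4}, {"c", 4, 4},
                           {"d", 4, 4}, {"e", 8, 8}, {"r", 4, 4}};
  uint64_t sharedMemSize = 0, alignment = 0;
  for (const Var &v : vars) {
    uint64_t offset = alignTo(sharedMemSize, v.align); // variable's start offset
    sharedMemSize = offset + alignTo(v.size, v.align); // advance running size
    alignment = std::max(alignment, v.align);          // global alignment
    std::printf("%s -> offset %llu\n", v.name, (unsigned long long)offset);
  }
  // Prints offsets 0, 4, 8, 12, 16, 24; size 28; alignment 8 -- the values
  // checked against the generated fir.global in the new test.
  std::printf("size %llu, alignment %llu\n",
              (unsigned long long)sharedMemSize, (unsigned long long)alignment);
}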

6 files changed: +198 -0 lines changed

flang/include/flang/Optimizer/Builder/CUFCommon.h

Lines changed: 1 addition & 0 deletions
@@ -14,6 +14,7 @@
 #include "mlir/IR/BuiltinOps.h"
 
 static constexpr llvm::StringRef cudaDeviceModuleName = "cuda_device_mod";
+static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem";
 
 namespace fir {
 class FirOpBuilder;

flang/include/flang/Optimizer/Transforms/Passes.h

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ namespace fir {
 #define GEN_PASS_DECL_CUFDEVICEGLOBAL
 #define GEN_PASS_DECL_CUFGPUTOLLVMCONVERSION
 #define GEN_PASS_DECL_CUFOPCONVERSION
+#define GEN_PASS_DECL_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
 #define GEN_PASS_DECL_EXTERNALNAMECONVERSION
 #define GEN_PASS_DECL_MEMREFDATAFLOWOPT
 #define GEN_PASS_DECL_SIMPLIFYINTRINSICS

flang/include/flang/Optimizer/Transforms/Passes.td

Lines changed: 13 additions & 0 deletions
@@ -453,6 +453,19 @@ def CUFGPUToLLVMConversion : Pass<"cuf-gpu-convert-to-llvm", "mlir::ModuleOp"> {
   ];
 }
 
+def CUFComputeSharedMemoryOffsetsAndSize
+    : Pass<"cuf-compute-shared-memory", "mlir::ModuleOp"> {
+  let summary = "Create the shared memory global variable and set offsets";
+
+  let description = [{
+    Compute the size and alignment of the shared memory global and materialize
+    it. Compute the offset of each cuf.shared_memory operation according to
+    the global and set it.
+  }];
+
+  let dependentDialects = ["fir::FIROpsDialect"];
+}
+
 def SetRuntimeCallAttributes
     : Pass<"set-runtime-call-attrs", "mlir::func::FuncOp"> {
   let summary = "Set Fortran runtime fir.call attributes targeting LLVM IR";
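
The new pass is registered under the pipeline name cuf-compute-shared-memory, so it can be exercised in isolation with fir-opt --cuf-compute-shared-memory, which is exactly what the new regression test below does.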

flang/lib/Optimizer/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -13,6 +13,7 @@ add_flang_library(FIRTransforms
   CUFDeviceGlobal.cpp
   CUFOpConversion.cpp
   CUFGPUToLLVMConversion.cpp
+  CUFComputeSharedMemoryOffsetsAndSize.cpp
   ArrayValueCopy.cpp
   ExternalNameConversion.cpp
   MemoryUtils.cpp
flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp

Lines changed: 126 additions & 0 deletions
@@ -0,0 +1,126 @@
//===-- CUFComputeSharedMemoryOffsetsAndSize.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/CodeGen/Target.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRAttr.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Runtime/CUDA/registration.h"
#include "flang/Runtime/entry-names.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "llvm/ADT/SmallVector.h"

namespace fir {
#define GEN_PASS_DEF_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

using namespace Fortran::runtime::cuda;

namespace {

struct CUFComputeSharedMemoryOffsetsAndSize
    : public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
          CUFComputeSharedMemoryOffsetsAndSize> {

  void runOnOperation() override {
    mlir::ModuleOp mod = getOperation();
    mlir::SymbolTable symTab(mod);
    mlir::OpBuilder opBuilder{mod.getBodyRegion()};
    fir::FirOpBuilder builder(opBuilder, mod);
    fir::KindMapping kindMap{fir::getKindMapping(mod)};
    std::optional<mlir::DataLayout> dl =
        fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false);
    if (!dl) {
      mlir::emitError(mod.getLoc(),
                      "data layout attribute is required to perform " +
                          getName() + "pass");
    }

    auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab);
    mlir::Type i8Ty = builder.getI8Type();
    for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
      unsigned nbDynamicSharedVariables = 0;
      unsigned nbStaticSharedVariables = 0;
      uint64_t sharedMemSize = 0;
      unsigned short alignment = 0;

      // Go over each shared memory operation and compute their start offset and
      // the size and alignment of the global to be generated if all variables
      // are static. If this is dynamic shared memory, then only the alignment
      // is computed.
      for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) {
        if (fir::hasDynamicSize(sharedOp.getInType())) {
          mlir::Type ty = sharedOp.getInType();
          // getTypeSizeAndAlignmentOrCrash will crash trying to compute the
          // size of an array with dynamic size. Just get the alignment to
          // create the global.
          if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
            ty = seqTy.getEleTy();
          unsigned short align = dl->getTypeABIAlignment(ty);
          ++nbDynamicSharedVariables;
          sharedOp.setOffset(0);
          alignment = std::max(alignment, align);
          continue;
        }
        auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
            sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap);
        ++nbStaticSharedVariables;
        sharedOp.setOffset(llvm::alignTo(sharedMemSize, align));
        sharedMemSize =
            llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
        alignment = std::max(alignment, align);
      }
      if (nbDynamicSharedVariables > 0 && nbStaticSharedVariables > 0)
        mlir::emitError(
            funcOp.getLoc(),
            "static and dynamic shared variables in a single kernel");

      mlir::DenseElementsAttr init = {};
      if (sharedMemSize > 0) {
        auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty);
        mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
        init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
      }

      // Create the shared memory global where each shared variable will point
      // to.
      auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
      std::string sharedMemGlobalName =
          (funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
      mlir::StringAttr linkage = builder.createInternalLinkage();
      builder.setInsertionPointToEnd(gpuMod.getBody());
      llvm::SmallVector<mlir::NamedAttribute> attrs;
      auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
                                              gpuMod.getContext());
      attrs.push_back(mlir::NamedAttribute(
          fir::GlobalOp::getDataAttrAttrName(globalOpName),
          cuf::DataAttributeAttr::get(gpuMod.getContext(),
                                      cuf::DataAttribute::Shared)));
      auto sharedMem = builder.create<fir::GlobalOp>(
          funcOp.getLoc(), sharedMemGlobalName, false, false, sharedMemType,
          init, linkage, attrs);
      sharedMem.setAlignment(alignment);
    }
  }
};

} // end anonymous namespace
Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
// RUN: fir-opt --split-input-file --cuf-compute-shared-memory %s | FileCheck %s

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
  gpu.module @cuda_device_mod {
    gpu.func @_QPdynshared() kernel {
      %c-1 = arith.constant -1 : index
      %6 = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
      %7 = fir.shape %c-1 : (index) -> !fir.shape<1>
      %8 = fir.declare %6(%7) {data_attr = #cuf.cuda<shared>, uniq_name = "_QFdynsharedEr"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>>
      gpu.return
    }
  }
}

// CHECK-LABEL: gpu.module @cuda_device_mod
// CHECK: gpu.func @_QPdynshared()
// CHECK: %{{.*}} = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", offset = 0 : i32, uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
// CHECK: gpu.return
// CHECK: }
// CHECK: fir.global internal @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>

// -----

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
  gpu.module @cuda_device_mod {
    gpu.func @_QPshared_static() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
      %0 = cuf.shared_memory i32 {bindc_name = "a", uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
      %1 = fir.declare %0 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEa"} : (!fir.ref<i32>) -> !fir.ref<i32>
      %2 = cuf.shared_memory i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
      %3 = fir.declare %2 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEb"} : (!fir.ref<i32>) -> !fir.ref<i32>
      %8 = cuf.shared_memory i32 {bindc_name = "c", uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
      %9 = fir.declare %8 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEc"} : (!fir.ref<i32>) -> !fir.ref<i32>
      %10 = cuf.shared_memory i32 {bindc_name = "d", uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
      %11 = fir.declare %10 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEd"} : (!fir.ref<i32>) -> !fir.ref<i32>
      %12 = cuf.shared_memory i64 {bindc_name = "e", uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
      %13 = fir.declare %12 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEe"} : (!fir.ref<i64>) -> !fir.ref<i64>
      %16 = cuf.shared_memory f32 {bindc_name = "r", uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
      %17 = fir.declare %16 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEr"} : (!fir.ref<f32>) -> !fir.ref<f32>
      gpu.return
    }
  }
}

// CHECK-LABEL: gpu.module @cuda_device_mod
// CHECK: gpu.func @_QPshared_static()
// CHECK: cuf.shared_memory i32 {bindc_name = "a", offset = 0 : i32, uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory i32 {bindc_name = "b", offset = 4 : i32, uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory i32 {bindc_name = "c", offset = 8 : i32, uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory i32 {bindc_name = "d", offset = 12 : i32, uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory i64 {bindc_name = "e", offset = 16 : i32, uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
// CHECK: cuf.shared_memory f32 {bindc_name = "r", offset = 24 : i32, uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
// CHECK: gpu.return
// CHECK: }
// CHECK: fir.global internal @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
// CHECK: }
// CHECK: }
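
The static-case expectations follow directly from the packing rule: the four i32 variables a through d land at offsets 0, 4, 8 and 12; the i64 e is aligned to 8 bytes and lands at offset 16; the f32 r follows at offset 24, giving 28 bytes in total and a global alignment of 8 (the largest member alignment), hence the dense<0> : vector<28xi8> initializer with alignment = 8. In the dynamic case no size can be computed, so the global keeps the zero-sized !fir.array<0xi8> type, only the element alignment (4 for f32) is recorded, and the actual amount of shared memory is supplied at kernel launch.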
