
Commit bd48183

[flang][cuda] Compute offset on cuf.shared_memory ops

1 parent 4b4a9e0

6 files changed: +198 -0 lines

flang/include/flang/Optimizer/Builder/CUFCommon.h

Lines changed: 1 addition & 0 deletions

@@ -14,6 +14,7 @@
 #include "mlir/IR/BuiltinOps.h"
 
 static constexpr llvm::StringRef cudaDeviceModuleName = "cuda_device_mod";
+static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem";
 
 namespace fir {
 class FirOpBuilder;
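The new cudaSharedMemSuffix is appended to a kernel's mangled name to form the name of that kernel's shared memory global; the tests at the end of this commit expect @_QPdynshared__shared_mem and @_QPshared_static__shared_mem.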

flang/include/flang/Optimizer/Transforms/Passes.h

Lines changed: 1 addition & 0 deletions

@@ -43,6 +43,7 @@ namespace fir {
 #define GEN_PASS_DECL_CUFDEVICEGLOBAL
 #define GEN_PASS_DECL_CUFGPUTOLLVMCONVERSION
 #define GEN_PASS_DECL_CUFOPCONVERSION
+#define GEN_PASS_DECL_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
 #define GEN_PASS_DECL_EXTERNALNAMECONVERSION
 #define GEN_PASS_DECL_MEMREFDATAFLOWOPT
 #define GEN_PASS_DECL_SIMPLIFYINTRINSICS

flang/include/flang/Optimizer/Transforms/Passes.td

Lines changed: 13 additions & 0 deletions

@@ -453,6 +453,19 @@ def CUFGPUToLLVMConversion : Pass<"cuf-gpu-convert-to-llvm", "mlir::ModuleOp"> {
   ];
 }
 
+def CUFComputeSharedMemoryOffsetsAndSize
+    : Pass<"cuf-compute-shared-memory", "mlir::ModuleOp"> {
+  let summary = "Create the shared memory global variable and set offsets";
+
+  let description = [{
+    Compute the size and alignment of the shared memory global and materialize
+    it. Compute the offset of each cuf.shared_memory operation according to
+    the global and set it.
+  }];
+
+  let dependentDialects = ["fir::FIROpsDialect"];
+}
+
 def SetRuntimeCallAttributes
     : Pass<"set-runtime-call-attrs", "mlir::func::FuncOp"> {
   let summary = "Set Fortran runtime fir.call attributes targeting LLVM IR";

flang/lib/Optimizer/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -13,6 +13,7 @@ add_flang_library(FIRTransforms
   CUFDeviceGlobal.cpp
   CUFOpConversion.cpp
   CUFGPUToLLVMConversion.cpp
+  CUFComputeSharedMemoryOffsetsAndSize.cpp
   ArrayValueCopy.cpp
   ExternalNameConversion.cpp
   MemoryUtils.cpp
flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp

Lines changed: 126 additions & 0 deletions

@@ -0,0 +1,126 @@
//===-- CUFComputeSharedMemoryOffsetsAndSize.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/CodeGen/Target.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRAttr.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Runtime/CUDA/registration.h"
#include "flang/Runtime/entry-names.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "llvm/ADT/SmallVector.h"

namespace fir {
#define GEN_PASS_DEF_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

using namespace Fortran::runtime::cuda;

namespace {

struct CUFComputeSharedMemoryOffsetsAndSize
    : public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
          CUFComputeSharedMemoryOffsetsAndSize> {

  void runOnOperation() override {
    mlir::ModuleOp mod = getOperation();
    mlir::SymbolTable symTab(mod);
    mlir::OpBuilder opBuilder{mod.getBodyRegion()};
    fir::FirOpBuilder builder(opBuilder, mod);
    fir::KindMapping kindMap{fir::getKindMapping(mod)};
    std::optional<mlir::DataLayout> dl =
        fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false);
    if (!dl) {
      mlir::emitError(mod.getLoc(),
                      "data layout attribute is required to perform " +
                          getName() + " pass");
    }

    auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab);
    mlir::Type i8Ty = builder.getI8Type();
    for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
      unsigned nbDynamicSharedVariables = 0;
      unsigned nbStaticSharedVariables = 0;
      uint64_t sharedMemSize = 0;
      unsigned short alignment = 0;

      // Go over each shared memory operation and compute its start offset as
      // well as the size and alignment of the global to be generated when all
      // variables are static. For dynamic shared memory, only the alignment
      // is computed.
      for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) {
        if (fir::hasDynamicSize(sharedOp.getInType())) {
          mlir::Type ty = sharedOp.getInType();
          // getTypeSizeAndAlignmentOrCrash will crash trying to compute the
          // size of an array with dynamic size. Just get the alignment to
          // create the global.
          if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
            ty = seqTy.getEleTy();
          unsigned short align = dl->getTypeABIAlignment(ty);
          ++nbDynamicSharedVariables;
          sharedOp.setOffset(0);
          alignment = std::max(alignment, align);
          continue;
        }
        auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
            sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap);
        ++nbStaticSharedVariables;
        sharedOp.setOffset(llvm::alignTo(sharedMemSize, align));
        sharedMemSize =
            llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
        alignment = std::max(alignment, align);
      }
      if (nbDynamicSharedVariables > 0 && nbStaticSharedVariables > 0)
        mlir::emitError(
            funcOp.getLoc(),
            "static and dynamic shared variables in a single kernel");

      mlir::DenseElementsAttr init = {};
      if (sharedMemSize > 0) {
        auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty);
        mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
        init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
      }

      // Create the shared memory global that each shared variable will point
      // into.
      auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
      std::string sharedMemGlobalName =
          (funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
      mlir::StringAttr linkage = builder.createInternalLinkage();
      builder.setInsertionPointToEnd(gpuMod.getBody());
      llvm::SmallVector<mlir::NamedAttribute> attrs;
      auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
                                              gpuMod.getContext());
      attrs.push_back(mlir::NamedAttribute(
          fir::GlobalOp::getDataAttrAttrName(globalOpName),
          cuf::DataAttributeAttr::get(gpuMod.getContext(),
                                      cuf::DataAttribute::Shared)));
      auto sharedMem = builder.create<fir::GlobalOp>(
          funcOp.getLoc(), sharedMemGlobalName, false, false, sharedMemType,
          init, linkage, attrs);
      sharedMem.setAlignment(alignment);
    }
  }
};

} // end anonymous namespace
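Two details of the pass are worth noting. For a variable with dynamic size the extent is unknown at compile time, so only the element type's ABI alignment feeds into the global and the op's offset is set to 0; that is why the first test case below expects a zero-sized !fir.array<0xi8> global. And a kernel that mixes static and dynamic shared variables is rejected with an error, since the two cannot share a single layout.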
Lines changed: 56 additions & 0 deletions

@@ -0,0 +1,56 @@
// RUN: fir-opt --split-input-file --cuf-compute-shared-memory %s | FileCheck %s

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
  gpu.module @cuda_device_mod {
    gpu.func @_QPdynshared() kernel {
      %c-1 = arith.constant -1 : index
      %6 = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
      %7 = fir.shape %c-1 : (index) -> !fir.shape<1>
      %8 = fir.declare %6(%7) {data_attr = #cuf.cuda<shared>, uniq_name = "_QFdynsharedEr"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>>
      gpu.return
    }
  }
}

// CHECK-LABEL: gpu.module @cuda_device_mod
// CHECK: gpu.func @_QPdynshared()
// CHECK: %{{.*}} = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", offset = 0 : i32, uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
// CHECK: gpu.return
// CHECK: }
// CHECK: fir.global internal @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>

// -----

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
  gpu.module @cuda_device_mod {
    gpu.func @_QPshared_static() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
      %0 = cuf.shared_memory i32 {bindc_name = "a", uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
      %1 = fir.declare %0 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEa"} : (!fir.ref<i32>) -> !fir.ref<i32>
      %2 = cuf.shared_memory i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
      %3 = fir.declare %2 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEb"} : (!fir.ref<i32>) -> !fir.ref<i32>
      %8 = cuf.shared_memory i32 {bindc_name = "c", uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
      %9 = fir.declare %8 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEc"} : (!fir.ref<i32>) -> !fir.ref<i32>
      %10 = cuf.shared_memory i32 {bindc_name = "d", uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
      %11 = fir.declare %10 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEd"} : (!fir.ref<i32>) -> !fir.ref<i32>
      %12 = cuf.shared_memory i64 {bindc_name = "e", uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
      %13 = fir.declare %12 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEe"} : (!fir.ref<i64>) -> !fir.ref<i64>
      %16 = cuf.shared_memory f32 {bindc_name = "r", uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
      %17 = fir.declare %16 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEr"} : (!fir.ref<f32>) -> !fir.ref<f32>
      gpu.return
    }
  }
}

// CHECK-LABEL: gpu.module @cuda_device_mod
// CHECK: gpu.func @_QPshared_static()
// CHECK: cuf.shared_memory i32 {bindc_name = "a", offset = 0 : i32, uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory i32 {bindc_name = "b", offset = 4 : i32, uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory i32 {bindc_name = "c", offset = 8 : i32, uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory i32 {bindc_name = "d", offset = 12 : i32, uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory i64 {bindc_name = "e", offset = 16 : i32, uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
// CHECK: cuf.shared_memory f32 {bindc_name = "r", offset = 24 : i32, uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
// CHECK: gpu.return
// CHECK: }
// CHECK: fir.global internal @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
// CHECK: }
// CHECK: }
