[flang][cuda] Compute offset on cuf.shared_memory ops #131395

Merged
merged 2 commits on Mar 15, 2025

1 change: 1 addition & 0 deletions flang/include/flang/Optimizer/Builder/CUFCommon.h
@@ -14,6 +14,7 @@
#include "mlir/IR/BuiltinOps.h"

static constexpr llvm::StringRef cudaDeviceModuleName = "cuda_device_mod";
static constexpr llvm::StringRef cudaSharedMemSuffix = "__shared_mem";

namespace fir {
class FirOpBuilder;
1 change: 1 addition & 0 deletions flang/include/flang/Optimizer/Transforms/Passes.h
@@ -43,6 +43,7 @@ namespace fir {
#define GEN_PASS_DECL_CUFDEVICEGLOBAL
#define GEN_PASS_DECL_CUFGPUTOLLVMCONVERSION
#define GEN_PASS_DECL_CUFOPCONVERSION
#define GEN_PASS_DECL_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
#define GEN_PASS_DECL_EXTERNALNAMECONVERSION
#define GEN_PASS_DECL_MEMREFDATAFLOWOPT
#define GEN_PASS_DECL_SIMPLIFYINTRINSICS
13 changes: 13 additions & 0 deletions flang/include/flang/Optimizer/Transforms/Passes.td
@@ -453,6 +453,19 @@ def CUFGPUToLLVMConversion : Pass<"cuf-gpu-convert-to-llvm", "mlir::ModuleOp"> {
];
}

def CUFComputeSharedMemoryOffsetsAndSize
: Pass<"cuf-compute-shared-memory", "mlir::ModuleOp"> {
let summary = "Create the shared memory global variable and set offsets";

let description = [{
Compute the size and alignment of the shared memory global and materialize
it. Compute the offset of each cuf.shared_memory operation within that
global and set it on the operation.
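For example, shared variables of types i32, i32, and i64 declared in that
order receive offsets 0, 4, and 8, and the global becomes a 16-byte i8
array aligned to 8.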
}];

let dependentDialects = ["fir::FIROpsDialect"];
}

def SetRuntimeCallAttributes
: Pass<"set-runtime-call-attrs", "mlir::func::FuncOp"> {
let summary = "Set Fortran runtime fir.call attributes targeting LLVM IR";
1 change: 1 addition & 0 deletions flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -13,6 +13,7 @@ add_flang_library(FIRTransforms
CUFDeviceGlobal.cpp
CUFOpConversion.cpp
CUFGPUToLLVMConversion.cpp
CUFComputeSharedMemoryOffsetsAndSize.cpp
ArrayValueCopy.cpp
ExternalNameConversion.cpp
MemoryUtils.cpp
126 changes: 126 additions & 0 deletions flang/lib/Optimizer/Transforms/CUFComputeSharedMemoryOffsetsAndSize.cpp
@@ -0,0 +1,126 @@
//===-- CUFComputeSharedMemoryOffsetsAndSize.cpp --------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "flang/Optimizer/Builder/BoxValue.h"
#include "flang/Optimizer/Builder/CUFCommon.h"
#include "flang/Optimizer/Builder/FIRBuilder.h"
#include "flang/Optimizer/Builder/Runtime/RTBuilder.h"
#include "flang/Optimizer/Builder/Todo.h"
#include "flang/Optimizer/CodeGen/Target.h"
#include "flang/Optimizer/CodeGen/TypeConverter.h"
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
#include "flang/Optimizer/Dialect/FIRAttr.h"
#include "flang/Optimizer/Dialect/FIRDialect.h"
#include "flang/Optimizer/Dialect/FIROps.h"
#include "flang/Optimizer/Dialect/FIROpsSupport.h"
#include "flang/Optimizer/Dialect/FIRType.h"
#include "flang/Optimizer/Support/DataLayout.h"
#include "flang/Runtime/CUDA/registration.h"
#include "flang/Runtime/entry-names.h"
#include "mlir/Dialect/GPU/IR/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/IR/Value.h"
#include "mlir/Pass/Pass.h"
#include "llvm/ADT/SmallVector.h"

namespace fir {
#define GEN_PASS_DEF_CUFCOMPUTESHAREDMEMORYOFFSETSANDSIZE
#include "flang/Optimizer/Transforms/Passes.h.inc"
} // namespace fir

using namespace Fortran::runtime::cuda;

namespace {

struct CUFComputeSharedMemoryOffsetsAndSize
: public fir::impl::CUFComputeSharedMemoryOffsetsAndSizeBase<
CUFComputeSharedMemoryOffsetsAndSize> {

void runOnOperation() override {
mlir::ModuleOp mod = getOperation();
mlir::SymbolTable symTab(mod);
mlir::OpBuilder opBuilder{mod.getBodyRegion()};
fir::FirOpBuilder builder(opBuilder, mod);
fir::KindMapping kindMap{fir::getKindMapping(mod)};
std::optional<mlir::DataLayout> dl =
fir::support::getOrSetMLIRDataLayout(mod, /*allowDefaultLayout=*/false);
if (!dl) {
mlir::emitError(mod.getLoc(),
"data layout attribute is required to perform " +
getName() + " pass");
// Bail out: the data layout is dereferenced unconditionally below.
return signalPassFailure();
}

auto gpuMod = cuf::getOrCreateGPUModule(mod, symTab);
mlir::Type i8Ty = builder.getI8Type();
for (auto funcOp : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
unsigned nbDynamicSharedVariables = 0;
unsigned nbStaticSharedVariables = 0;
uint64_t sharedMemSize = 0;
unsigned short alignment = 0;

// Go over each shared memory operation and compute its start offset as
// well as the size and alignment of the global to be generated if all
// variables are static. For dynamic shared memory, only the alignment
// is computed.
for (auto sharedOp : funcOp.getOps<cuf::SharedMemoryOp>()) {
if (fir::hasDynamicSize(sharedOp.getInType())) {
mlir::Type ty = sharedOp.getInType();
// getTypeSizeAndAlignmentOrCrash will crash trying to compute the
// size of an array with dynamic size. Just get the alignment to
// create the global.
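// For example, !fir.array<?xf32> contributes only the alignment of its
// f32 element type (4 bytes under a typical data layout) and gets
// offset 0.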
if (auto seqTy = mlir::dyn_cast<fir::SequenceType>(ty))
ty = seqTy.getEleTy();
unsigned short align = dl->getTypeABIAlignment(ty);
++nbDynamicSharedVariables;
sharedOp.setOffset(0);
alignment = std::max(alignment, align);
continue;
}
auto [size, align] = fir::getTypeSizeAndAlignmentOrCrash(
sharedOp.getLoc(), sharedOp.getInType(), *dl, kindMap);
++nbStaticSharedVariables;
sharedOp.setOffset(llvm::alignTo(sharedMemSize, align));
sharedMemSize =
llvm::alignTo(sharedMemSize, align) + llvm::alignTo(size, align);
alignment = std::max(alignment, align);
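// Running example: i32, i32, i64 in declaration order get offsets 0, 4,
// and 8; sharedMemSize becomes 16 and alignment becomes 8.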
}
if (nbDynamicSharedVariables > 0 && nbStaticSharedVariables > 0)
mlir::emitError(
funcOp.getLoc(),
"static and dynamic shared variables in a single kernel");

mlir::DenseElementsAttr init = {};
if (sharedMemSize > 0) {
auto vecTy = mlir::VectorType::get(sharedMemSize, i8Ty);
mlir::Attribute zero = mlir::IntegerAttr::get(i8Ty, 0);
init = mlir::DenseElementsAttr::get(vecTy, llvm::ArrayRef(zero));
}
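// If every variable is dynamic, sharedMemSize stays 0 and the global below
// is a zero-sized array with no initializer; only its alignment is used.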

// Create the shared memory global into which each shared variable will
// point.
auto sharedMemType = fir::SequenceType::get(sharedMemSize, i8Ty);
std::string sharedMemGlobalName =
(funcOp.getName() + llvm::Twine(cudaSharedMemSuffix)).str();
mlir::StringAttr linkage = builder.createInternalLinkage();
builder.setInsertionPointToEnd(gpuMod.getBody());
llvm::SmallVector<mlir::NamedAttribute> attrs;
auto globalOpName = mlir::OperationName(fir::GlobalOp::getOperationName(),
gpuMod.getContext());
attrs.push_back(mlir::NamedAttribute(
fir::GlobalOp::getDataAttrAttrName(globalOpName),
cuf::DataAttributeAttr::get(gpuMod.getContext(),
cuf::DataAttribute::Shared)));
auto sharedMem = builder.create<fir::GlobalOp>(
funcOp.getLoc(), sharedMemGlobalName, false, false, sharedMemType,
init, linkage, attrs);
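// Use the largest ABI alignment seen among this kernel's shared variables.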
sharedMem.setAlignment(alignment);
}
}
};

} // end anonymous namespace
56 changes: 56 additions & 0 deletions flang/test/Fir/CUDA/cuda-shared-offset.mlir
@@ -0,0 +1,56 @@
// RUN: fir-opt --split-input-file --cuf-compute-shared-memory %s | FileCheck %s

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
gpu.module @cuda_device_mod {
gpu.func @_QPdynshared() kernel {
%c-1 = arith.constant -1 : index
%6 = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
%7 = fir.shape %c-1 : (index) -> !fir.shape<1>
%8 = fir.declare %6(%7) {data_attr = #cuf.cuda<shared>, uniq_name = "_QFdynsharedEr"} : (!fir.ref<!fir.array<?xf32>>, !fir.shape<1>) -> !fir.ref<!fir.array<?xf32>>
gpu.return
}
}
}

// CHECK-LABEL: gpu.module @cuda_device_mod
// CHECK: gpu.func @_QPdynshared()
// CHECK: %{{.*}} = cuf.shared_memory !fir.array<?xf32>, %c-1 : index {bindc_name = "r", offset = 0 : i32, uniq_name = "_QFdynsharedEr"} -> !fir.ref<!fir.array<?xf32>>
// CHECK: gpu.return
// CHECK: }
// CHECK: fir.global internal @_QPdynshared__shared_mem {alignment = 4 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<0xi8>
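// Note: with only dynamic shared memory the global is zero-sized; its
// alignment (4) comes from the f32 element type of the dynamically sized
// array.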

// -----

module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", gpu.container_module, llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", llvm.ident = "flang version 20.0.0 (https://github.com/llvm/llvm-project.git cae351f3453a0a26ec8eb2ddaf773c24a29d929e)", llvm.target_triple = "x86_64-unknown-linux-gnu"} {
gpu.module @cuda_device_mod {
gpu.func @_QPshared_static() attributes {cuf.proc_attr = #cuf.cuda_proc<global>} {
%0 = cuf.shared_memory i32 {bindc_name = "a", uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
%1 = fir.declare %0 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEa"} : (!fir.ref<i32>) -> !fir.ref<i32>
%2 = cuf.shared_memory i32 {bindc_name = "b", uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
%3 = fir.declare %2 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEb"} : (!fir.ref<i32>) -> !fir.ref<i32>
%8 = cuf.shared_memory i32 {bindc_name = "c", uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
%9 = fir.declare %8 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEc"} : (!fir.ref<i32>) -> !fir.ref<i32>
%10 = cuf.shared_memory i32 {bindc_name = "d", uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
%11 = fir.declare %10 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEd"} : (!fir.ref<i32>) -> !fir.ref<i32>
%12 = cuf.shared_memory i64 {bindc_name = "e", uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
%13 = fir.declare %12 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEe"} : (!fir.ref<i64>) -> !fir.ref<i64>
%16 = cuf.shared_memory f32 {bindc_name = "r", uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
%17 = fir.declare %16 {data_attr = #cuf.cuda<shared>, uniq_name = "_QFshared_staticEr"} : (!fir.ref<f32>) -> !fir.ref<f32>
gpu.return
}
}
}

// CHECK-LABEL: gpu.module @cuda_device_mod
// CHECK: gpu.func @_QPshared_static()
// CHECK: cuf.shared_memory i32 {bindc_name = "a", offset = 0 : i32, uniq_name = "_QFshared_staticEa"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory i32 {bindc_name = "b", offset = 4 : i32, uniq_name = "_QFshared_staticEb"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory i32 {bindc_name = "c", offset = 8 : i32, uniq_name = "_QFshared_staticEc"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory i32 {bindc_name = "d", offset = 12 : i32, uniq_name = "_QFshared_staticEd"} -> !fir.ref<i32>
// CHECK: cuf.shared_memory i64 {bindc_name = "e", offset = 16 : i32, uniq_name = "_QFshared_staticEe"} -> !fir.ref<i64>
// CHECK: cuf.shared_memory f32 {bindc_name = "r", offset = 24 : i32, uniq_name = "_QFshared_staticEr"} -> !fir.ref<f32>
// CHECK: gpu.return
// CHECK: }
// CHECK: fir.global internal @_QPshared_static__shared_mem(dense<0> : vector<28xi8>) {alignment = 8 : i64, data_attr = #cuf.cuda<shared>} : !fir.array<28xi8>
// CHECK: }
// CHECK: }
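// Note: the four i32 variables occupy offsets 0-12, the i64 is aligned up to
// offset 16, and the f32 follows at offset 24, giving a 28-byte global whose
// alignment of 8 is the largest member alignment.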