[mlir][gpu] Introduce gpu.dynamic_shared_memory Op #71546

Merged · 19 commits · Nov 16, 2023
8 changes: 8 additions & 0 deletions mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -52,6 +52,14 @@ def GPU_Dialect : Dialect {
/// Returns the numeric value used to identify the private memory address
/// space.
static AddressSpace getPrivateAddressSpace() { return AddressSpace::Private; }

/// Return true if the given MemRefType has an address space that matches
/// the gpu::AddressSpaceAttr attribute with value `workgroup`.
static bool hasWorkgroupMemoryAddressSpace(MemRefType type);

/// Return true if the given Attribute is a gpu::AddressSpaceAttr
/// attribute with value `workgroup`.
static bool isWorkgroupMemoryAddressSpace(Attribute memorySpace);
}];

let dependentDialects = ["arith::ArithDialect"];
26 changes: 26 additions & 0 deletions mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -433,6 +433,32 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
let hasVerifier = 1;
}

def GPU_DynamicSharedMemoryOp : GPU_Op<"dynamic_shared_memory", [Pure]>
{
let summary = "Get the memref for dynamic shared memory";

let description = [{
This operation provides a memref pointer to the start of dynamic shared
memory, often referred to as workgroup memory. It's important to note that
this dynamic shared memory needs to be allocated at kernel launch. One can
conveniently use the `dynamic_shared_memory_size` parameter of
`gpu.launch` for this purpose.

Examples:
```mlir
%0 = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
%1 = memref.view %0[%c8192][] : memref<?xi8, #gpu.address_space<workgroup>>
to memref<32x64xf32, #gpu.address_space<workgroup>>
%2 = memref.view %0[%c16384][] : memref<?xi8, #gpu.address_space<workgroup>>
to memref<32x64xf32, #gpu.address_space<workgroup>>
```
}];
let arguments = (ins);
let results = (outs Arg<MemRefRankOf<[I8], [1]>>:$resultMemref);
let assemblyFormat = [{ attr-dict `:` type($resultMemref) }];
let hasVerifier = 1;
}
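
As a sketch of how this op pairs with a launch, the IR below allocates 16384 bytes of dynamic shared memory via `dynamic_shared_memory_size` and carves a typed view out of it inside the kernel body; the constants, launch dimensions, and view shape are illustrative assumptions, not part of this patch.

```mlir
// Illustrative only: sizes, dimensions, and the kernel body are assumed.
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%shmem_bytes = arith.constant 16384 : i32
gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
           threads(%tx, %ty, %tz) in (%sx = %c32, %sy = %c1, %sz = %c1)
           dynamic_shared_memory_size %shmem_bytes {
  // Returns the base of the memory allocated at launch time.
  %shmem = gpu.dynamic_shared_memory : memref<?xi8, #gpu.address_space<workgroup>>
  %view = memref.view %shmem[%c0][] : memref<?xi8, #gpu.address_space<workgroup>>
                                      to memref<32x64xf32, #gpu.address_space<workgroup>>
  gpu.terminator
}
```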

def LaunchIndx : AnyTypeOf<[Index, I32, I64]>;

def GPU_LaunchFuncOp :GPU_Op<"launch_func", [
3 changes: 3 additions & 0 deletions mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h
@@ -27,6 +27,9 @@
namespace mlir {
namespace NVVM {

// Shared memory has 128-bit alignment.
constexpr int kSharedMemoryAlignmentBit = 128;

/// NVVM memory space identifiers.
enum NVVMMemorySpace {
/// Global memory space identifier.
18 changes: 18 additions & 0 deletions mlir/include/mlir/IR/SymbolTable.h
@@ -103,6 +103,24 @@ class SymbolTable {
Nested,
};

/// Generate a unique symbol name. Iteratively increase uniquingCounter
/// and use it as a suffix for symbol names until uniqueChecker does not
/// detect any conflict.
template <unsigned N, typename UniqueChecker>
static SmallString<N> generateSymbolName(StringRef name,
UniqueChecker uniqueChecker,
unsigned &uniquingCounter) {
SmallString<N> nameBuffer(name);
unsigned originalLength = nameBuffer.size();
do {
nameBuffer.resize(originalLength);
nameBuffer += '_';
nameBuffer += std::to_string(uniquingCounter++);
} while (uniqueChecker(nameBuffer));

return nameBuffer;
}

/// Returns the name of the given symbol operation, aborting if no symbol is
/// present.
static StringAttr getSymbolName(Operation *symbol);
99 changes: 99 additions & 0 deletions mlir/lib/Conversion/GPUCommon/GPUOpsLowering.cpp
@@ -14,6 +14,7 @@
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinTypes.h"
#include "llvm/ADT/SmallVectorExtras.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/FormatVariadic.h"

using namespace mlir;
@@ -554,6 +555,104 @@ static IntegerAttr wrapNumericMemorySpace(MLIRContext *ctx, unsigned space) {
return IntegerAttr::get(IntegerType::get(ctx, 64), space);
}

/// Generates a symbol with a 0-sized array type for dynamic shared memory
/// usage, or reuses an existing suitable symbol.
LLVM::GlobalOp
getDynamicSharedMemorySymbol(ConversionPatternRewriter &rewriter,
Operation *moduleOp, gpu::DynamicSharedMemoryOp op,
const LLVMTypeConverter *typeConverter,
MemRefType memrefType, unsigned alignmentBit) {
uint64_t alignmentByte = alignmentBit / memrefType.getElementTypeBitWidth();

FailureOr<unsigned> addressSpace =
typeConverter->getMemRefAddressSpace(memrefType);
if (failed(addressSpace)) {
op->emitError() << "conversion of memref memory space "
<< memrefType.getMemorySpace()
<< " to integer address space "
"failed. Consider adding memory space conversions.";
}

// Step 1. Collect the symbol names of existing LLVM::GlobalOp ops. If any
// LLVM::GlobalOp is already suitable for shared memory, return it.
llvm::StringSet<> existingGlobalNames;
for (auto globalOp :
moduleOp->getRegion(0).front().getOps<LLVM::GlobalOp>()) {
existingGlobalNames.insert(globalOp.getSymName());
if (auto arrayType = dyn_cast<LLVM::LLVMArrayType>(globalOp.getType())) {
if (globalOp.getAddrSpace() == addressSpace.value() &&
arrayType.getNumElements() == 0 &&
globalOp.getAlignment().value_or(0) == alignmentByte) {
return globalOp;
}
}
}

// Step 2. Find a unique symbol name
unsigned uniquingCounter = 0;
SmallString<128> symName = SymbolTable::generateSymbolName<128>(
"__dynamic_shmem_",
[&](StringRef candidate) {
return existingGlobalNames.contains(candidate);
},
uniquingCounter);

// Step 3. Generate a global op
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(&moduleOp->getRegion(0).front().front());

auto zeroSizedArrayType = LLVM::LLVMArrayType::get(
typeConverter->convertType(memrefType.getElementType()), 0);

return rewriter.create<LLVM::GlobalOp>(
op->getLoc(), zeroSizedArrayType, /*isConstant=*/false,
LLVM::Linkage::Internal, symName, /*value=*/Attribute(), alignmentByte,
addressSpace.value());
}
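
On an NVVM target, where `kSharedMemoryAlignmentBit` is 128 (16 bytes for an `i8` array) and the workgroup space lowers to address space 3, the helper above would emit a global along these lines; this is a sketch, and the exact symbol name depends on the uniquing counter.

```mlir
// Hypothetical output of getDynamicSharedMemorySymbol for NVVM.
llvm.mlir.global internal @__dynamic_shmem__0() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
```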

LogicalResult GPUDynamicSharedMemoryOpLowering::matchAndRewrite(
gpu::DynamicSharedMemoryOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const {
Location loc = op.getLoc();
MemRefType memrefType = op.getResultMemref().getType();
Type elementType = typeConverter->convertType(memrefType.getElementType());

// Step 1: Generate a memref<0xi8> type
MemRefLayoutAttrInterface layout = {};
auto memrefType0sz =
MemRefType::get({0}, elementType, layout, memrefType.getMemorySpace());

// Step 2: Generate a global symbol, or reuse an existing one, for the
// dynamic shared memory with memref<0xi8> type
LLVM::LLVMFuncOp funcOp = op->getParentOfType<LLVM::LLVMFuncOp>();
LLVM::GlobalOp shmemOp = {};
Operation *moduleOp = funcOp->getParentWithTrait<OpTrait::SymbolTable>();
shmemOp = getDynamicSharedMemorySymbol(
rewriter, moduleOp, op, getTypeConverter(), memrefType0sz, alignmentBit);

// Step 3. Get address of the global symbol
OpBuilder::InsertionGuard guard(rewriter);
rewriter.setInsertionPoint(op);
auto basePtr = rewriter.create<LLVM::AddressOfOp>(loc, shmemOp);
Type baseType = basePtr->getResultTypes().front();

// Step 4. Generate GEP using offsets
SmallVector<LLVM::GEPArg> gepArgs = {0};
Value shmemPtr = rewriter.create<LLVM::GEPOp>(loc, baseType, elementType,
basePtr, gepArgs);
// Step 5. Create a memref descriptor
SmallVector<Value> shape, strides;
Value sizeBytes;
getMemRefDescriptorSizes(loc, memrefType0sz, {}, rewriter, shape, strides,
sizeBytes);
auto memRefDescriptor = this->createMemRefDescriptor(
loc, memrefType0sz, shmemPtr, shmemPtr, shape, strides, rewriter);

// Step 6. Replace the op with the memref descriptor
rewriter.replaceOp(op, {memRefDescriptor});
return success();
}
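
For reference, Steps 3 and 4 above correspond to LLVM-dialect IR roughly like the following; the descriptor construction is summarized in a comment, and the symbol name carries over from the sketch above.

```mlir
// Hypothetical lowering of gpu.dynamic_shared_memory (descriptor elided).
%base = llvm.mlir.addressof @__dynamic_shmem__0 : !llvm.ptr<3>
%ptr = llvm.getelementptr %base[0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, i8
// createMemRefDescriptor then packs %ptr (as allocated and aligned pointer)
// with the static offset, sizes, and strides of memref<0xi8> into the
// descriptor struct via llvm.mlir.undef / llvm.insertvalue.
```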

void mlir::populateGpuMemorySpaceAttributeConversions(
TypeConverter &typeConverter, const MemorySpaceMapping &mapping) {
typeConverter.addTypeAttributeConversion(
21 changes: 21 additions & 0 deletions mlir/lib/Conversion/GPUCommon/GPUOpsLowering.h
@@ -14,6 +14,27 @@

namespace mlir {

/// Lowering for gpu.dynamic_shared_memory to the LLVM dialect. The pattern
/// first creates a 0-sized global array symbol, as LLVM expects. It then
/// constructs a memref descriptor with these values and returns it.
struct GPUDynamicSharedMemoryOpLowering
: public ConvertOpToLLVMPattern<gpu::DynamicSharedMemoryOp> {
using ConvertOpToLLVMPattern<
gpu::DynamicSharedMemoryOp>::ConvertOpToLLVMPattern;
GPUDynamicSharedMemoryOpLowering(const LLVMTypeConverter &converter,
unsigned alignmentBit = 0)
: ConvertOpToLLVMPattern<gpu::DynamicSharedMemoryOp>(converter),
alignmentBit(alignmentBit) {}

LogicalResult
matchAndRewrite(gpu::DynamicSharedMemoryOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;

private:
// Alignment in bits
unsigned alignmentBit;
};

struct GPUFuncOpLowering : ConvertOpToLLVMPattern<gpu::GPUFuncOp> {
GPUFuncOpLowering(const LLVMTypeConverter &converter,
unsigned allocaAddrSpace, unsigned workgroupAddrSpace,
3 changes: 3 additions & 0 deletions mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp
@@ -325,6 +325,9 @@ void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter,
GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(
converter);

patterns.add<GPUDynamicSharedMemoryOpLowering>(
converter, NVVM::kSharedMemoryAlignmentBit);

// Explicitly drop memory space when lowering private memory
// attributions since NVVM models it as `alloca`s in the default
// memory space and does not support `alloca`s with addrspace(5).
44 changes: 34 additions & 10 deletions mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
@@ -23,6 +23,7 @@
#include "mlir/IR/Matchers.h"
#include "mlir/IR/OpImplementation.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/IR/SymbolTable.h"
#include "mlir/IR/TypeUtilities.h"
#include "mlir/Interfaces/FunctionImplementation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
@@ -164,17 +165,18 @@ MMAMatrixType::verify(function_ref<InFlightDiagnostic()> emitError,
// GPUDialect
//===----------------------------------------------------------------------===//

-/// GPU memory space identifiers.
-enum GPUMemorySpace {
-  /// Generic memory space identifier.
-  kGenericMemorySpace = 0,
-
-  /// Global memory space identifier.
-  kGlobalMemorySpace = 1,
-
-  /// Shared memory space identifier.
-  kSharedMemorySpace = 3
-};
+bool GPUDialect::isWorkgroupMemoryAddressSpace(Attribute memorySpace) {
+  if (!memorySpace)
+    return false;
+  if (auto gpuAttr = llvm::dyn_cast<gpu::AddressSpaceAttr>(memorySpace))
+    return gpuAttr.getValue() == getWorkgroupAddressSpace();
+  return false;
+}
+
+bool GPUDialect::hasWorkgroupMemoryAddressSpace(MemRefType type) {
+  Attribute memorySpace = type.getMemorySpace();
+  return isWorkgroupMemoryAddressSpace(memorySpace);
+}
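
As an illustration of what these helpers accept, consider the following memref types; the shapes are arbitrary examples, not taken from the patch.

```mlir
// hasWorkgroupMemoryAddressSpace is true only for the first type:
// memref<32x64xf32, #gpu.address_space<workgroup>>  -- gpu workgroup space
// memref<32x64xf32>                                 -- no memory space
// memref<32x64xf32, #gpu.address_space<private>>    -- different gpu space
// memref<32x64xf32, 3>                              -- raw integer space, not the attr
```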

bool GPUDialect::isKernel(Operation *op) {
UnitAttr isKernelAttr = op->getAttrOfType<UnitAttr>(getKernelFuncAttrName());
@@ -2024,6 +2026,28 @@ gpu::SelectObjectAttr::verify(function_ref<InFlightDiagnostic()> emitError,
return success();
}

//===----------------------------------------------------------------------===//
// DynamicSharedMemoryOp
//===----------------------------------------------------------------------===//

LogicalResult gpu::DynamicSharedMemoryOp::verify() {
if (!getOperation()->getParentWithTrait<OpTrait::SymbolTable>())
return emitOpError() << "must be inside an op with symbol table";

MemRefType memrefType = getResultMemref().getType();
// Check address space
if (!GPUDialect::hasWorkgroupMemoryAddressSpace(memrefType)) {
return emitOpError() << "address space must be "
<< gpu::AddressSpaceAttr::getMnemonic() << "<"
<< stringifyEnum(gpu::AddressSpace::Workgroup) << ">";
}
if (memrefType.hasStaticShape()) {
return emitOpError() << "result memref type must be memref<?xi8, "
"#gpu.address_space<workgroup>>";
}
return success();
}
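
For example, the verifier above would reject both of these illustrative results (assumed examples in the spirit of the checks, not test cases copied from the patch):

```mlir
// Rejected: result has a static shape.
%0 = gpu.dynamic_shared_memory : memref<32xi8, #gpu.address_space<workgroup>>
// Rejected: result lacks the workgroup address space.
%1 = gpu.dynamic_shared_memory : memref<?xi8>
```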

//===----------------------------------------------------------------------===//
// GPU target options
//===----------------------------------------------------------------------===//
20 changes: 8 additions & 12 deletions mlir/lib/IR/SymbolTable.cpp
@@ -200,20 +200,16 @@ StringAttr SymbolTable::insert(Operation *symbol, Block::iterator insertPt) {
// If the symbol was already in the table, also return.
if (symbolTable.lookup(name) == symbol)
return name;
-  // If a conflict was detected, then the symbol will not have been added to
-  // the symbol table. Try suffixes until we get to a unique name that works.
-  SmallString<128> nameBuffer(name.getValue());
-  unsigned originalLength = nameBuffer.size();

   MLIRContext *context = symbol->getContext();

-  // Iteratively try suffixes until we find one that isn't used.
-  do {
-    nameBuffer.resize(originalLength);
-    nameBuffer += '_';
-    nameBuffer += std::to_string(uniquingCounter++);
-  } while (!symbolTable.insert({StringAttr::get(context, nameBuffer), symbol})
-               .second);
+  SmallString<128> nameBuffer = generateSymbolName<128>(
+      name.getValue(),
+      [&](StringRef candidate) {
+        return !symbolTable
+                    .insert({StringAttr::get(context, candidate), symbol})
+                    .second;
+      },
+      uniquingCounter);
setSymbolName(symbol, nameBuffer);
return getSymbolName(symbol);
}