Skip to content

[mlir][AMDGPU] Plumb address space 7 through MLIR, add address_space attr. #125594

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 26, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 12 additions & 4 deletions mlir/include/mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,26 @@ namespace mlir {

class LLVMTypeConverter;
class RewritePatternSet;
class TypeConverter;
class Pass;

#define GEN_PASS_DECL_CONVERTAMDGPUTOROCDLPASS
#include "mlir/Conversion/Passes.h.inc"

/// Note: The ROCDL target does not support the LLVM bfloat type at this time
/// and so this function will add conversions to change all `bfloat` uses
/// to `i16`.
void populateAMDGPUToROCDLConversionPatterns(const LLVMTypeConverter &converter,
/// Note: This function will also add conversions for the AMDGPU-specific
/// address spaces, but those can be added separately using
/// populateAMDGPUMemorySpaceAttributeConversions().
void populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
RewritePatternSet &patterns,
amdgpu::Chipset chipset);

/// Remap AMDGPU memory spaces to LLVM address spaces
/// by mapping amdgpu::AddressSpace::fat_raw_buffer to ptr addrspace(7),
/// amdgpu::AddressSpace::buffer_rsrc to ptr addrspace(8), and
/// amdgpu::AddressSpace::fat_strided_buffer to ptr addrspace(9).
void populateAMDGPUMemorySpaceAttributeConversions(
TypeConverter &typeConverter);

} // namespace mlir

#endif // MLIR_CONVERSION_AMDGPUTOROCDL_AMDGPUTOROCDL_H_
105 changes: 105 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,11 @@
#ifndef AMDGPU
#define AMDGPU

include "mlir/Interfaces/InferTypeOpInterface.td"
include "mlir/Interfaces/SideEffectInterfaces.td"
include "mlir/Interfaces/ViewLikeInterface.td"
include "mlir/IR/EnumAttr.td"
include "mlir/IR/Properties.td"
include "mlir/IR/OpBase.td"

def AMDGPU_Dialect : Dialect {
Expand All @@ -32,6 +35,45 @@ def AMDGPU_Dialect : Dialect {
let useDefaultAttributePrinterParser = 1;
}

//===----------------------------------------------------------------------===//
// AMDGPU general attribute definitions
//===----------------------------------------------------------------------===//

// Enum of AMDGPU-specific address spaces. Per the lowering described in
// AMDGPUToROCDL.h in this change, these remap to LLVM address spaces:
// fat_raw_buffer -> ptr addrspace(7), buffer_rsrc -> ptr addrspace(8),
// and the index 2 case -> ptr addrspace(9).
// NOTE(review): the AMDGPUToROCDL.h comment calls the addrspace(9) case
// `fat_strided_buffer` while the enum case here is `fat_structured_buffer`
// — confirm the intended spelling is consistent across the two files.
def AMDGPU_AddressSpace : I32EnumAttr<"AddressSpace",
"AMDGPU-specific address spaces",
[
I32EnumAttrCase<"FatRawBuffer", 0, "fat_raw_buffer">,
I32EnumAttrCase<"BufferRsrc", 1, "buffer_rsrc">,
I32EnumAttrCase<"FatStructuredBuffer", 2, "fat_structured_buffer">,
]> {
// Only generate the generic EnumAttr wrapper; the specialized attribute
// class is defined separately below (AMDGPU_AddressSpaceAttr).
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::amdgpu";
}

// Specialized attribute wrapping the AMDGPU_AddressSpace enum so it can be
// used as a memref memory-space attribute, e.g.
// `#amdgpu.address_space<fat_raw_buffer>`.
def AMDGPU_AddressSpaceAttr : EnumAttr<AMDGPU_Dialect, AMDGPU_AddressSpace,
    "address_space"> {
  let description = [{
    AMDGPU-specific memory spaces that may not have exact analogues on other
    GPU targets or backends.

    - `fat_raw_buffer` is the memory space used when a memref is stored as
    a "buffer fat pointer" - that is, a buffer resource (that is set up to
    use raw byte-level indexing) along with its offset. The AMDGPU backend
    implements `ptr addrspace(7)` to represent these fat pointers so that
    buffer resources (which allow advanced features like bounds checking or
    cache swizzling) can be used like ordinary LLVM pointers or memrefs.
    See also the `fat_raw_buffer_cast` operation.
    - `buffer_rsrc` is the memory space for `ptr addrspace(8)`, representing a
    buffer resource. It should not be used for memrefs, since it does not
    support indexing.
    - `fat_structured_buffer` represents `ptr addrspace(9)`, a buffer resource
    that carries both an index and offset field, which are used for complex
    structured indexing that is primarily seen in graphics applications. This
    is also incompatible with the simple indexing model supported by memref.
  }];
  let assemblyFormat = "`<` $value `>`";
}

//===----------------------------------------------------------------------===//
// AMDGPU Op definitions
//===----------------------------------------------------------------------===//
Expand Down Expand Up @@ -118,6 +160,69 @@ def AMDGPU_PackedStochRoundFp8Op :
let hasVerifier = 1;
}

// View-like cast from an ordinary memref to one in the
// #amdgpu.address_space<fat_raw_buffer> memory space (LLVM ptr addrspace(7)).
def AMDGPU_FatRawBufferCastOp :
    AMDGPU_Op<"fat_raw_buffer_cast",
      [Pure,
       DeclareOpInterfaceMethods<InferTypeOpInterface>,
       ViewLikeOpInterface, AttrSizedOperandSegments]>,
    Arguments<(ins AnyMemRef:$source,
      Optional<I32>:$validBytes,
      Optional<I<14>>:$cacheSwizzleStride,
      DefaultValuedProp<BoolProp, "true">:$boundsCheck,
      UnitProp:$resetOffset)>,
    Results<(outs AnyMemRef:$result)> {
  let summary = "Create a raw buffer fat pointer that matches `memref`";
  let description = [{
    Wraps the memory pointed to by `source` as a raw buffer fat pointer, or,
    in LLVM terms, a `ptr addrspace(7)`, returning a memref that has the same
    sizes and layout but the `#amdgpu.address_space<fat_raw_buffer>`
    address space.

    This memref can be used with standard memref operations like `memref.load`,
    `memref.store`, and `memref.atomicrmw`, which will be lowered to the relevant
    buffer intrinsics. (`vector.masked_load/store` will work once there's backend
    support for lowering them, and then this document will be updated)

    If `validBytes` is given, it is the number of bytes that will be valid as
    an offset to `result`. If it is not provided, this will be inferred from
    the size of the memref during lowering. This size is
    max_{d = 0 upto rank(source)} (sizes[d] * strides[d]) * sizeof(element type).

    The flags of the buffer descriptor will be set up to enable raw usage -
    for example, stride = 0, add_tid = 0, and so on. The `boundsCheck`
    property determines if bounds checking is enabled or not (on architectures
    where this can be controlled - that is, on RDNA chips).

    If `cacheSwizzleStride` is provided, L1 cache swizzling will be enabled
    on architectures that support it. This swizzling, unlike the main swizzling
    mode (whose usage makes a buffer non-raw) does not affect index calculation,
    but does affect cache behavior. Mixing access between cache-swizzled raw
    buffers and other forms of memory access, like ordinary pointer loads or
    unswizzled buffer pointers can cause incorrect behavior and must be avoided.

    This operation preserves the sizes, strides, and offset of the input
    memref - they'll be added in by `memref.load` later. However, if
    `resetOffset` is set, that offset will be added to the base pointer.
    If the value of the memref's offset is not uniform (independent of the lane/thread ID),
    this will lead to substantially decreased performance due to the need for
    a waterfall loop on the base address of the buffer resource.
  }];

  let extraClassDeclaration = [{
    // ViewLikeOpInterface: the viewed source is the input memref.
    Value getViewSource() { return getSource(); }
  }];

  let assemblyFormat = [{
    $source oilist (`validBytes` `(` $validBytes `)`
                    | `cacheSwizzleStride` `(` $cacheSwizzleStride `)`
                    | `boundsCheck` `(` $boundsCheck `)`
                    | `resetOffset` $resetOffset )
    attr-dict `:` type($source) `to` type($result)
  }];

  let hasVerifier = 1;
}

/// Raw buffer load
def AMDGPU_RawBufferLoadOp :
AMDGPU_Op<"raw_buffer_load", [AllElementTypesMatch<["value", "memref"]>,
Expand Down
2 changes: 2 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/OpDefinition.h"
#include "mlir/Interfaces/InferTypeOpInterface.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "mlir/Interfaces/ViewLikeInterface.h"

#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h.inc"

Expand Down
3 changes: 3 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,15 @@ class ConversionTarget;
namespace amdgpu {

#define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS
#define GEN_PASS_DECL_AMDGPURESOLVESTRIDEDMETADATAPASS
#define GEN_PASS_REGISTRATION
#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"

void populateAmdgpuEmulateAtomicsPatterns(ConversionTarget &target,
RewritePatternSet &patterns,
Chipset chipset);

void populateAmdgpuResolveStridedMetadataPatterns(RewritePatternSet &patterns);
} // namespace amdgpu
} // namespace mlir

Expand Down
20 changes: 20 additions & 0 deletions mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,24 @@ def AmdgpuEmulateAtomicsPass : Pass<"amdgpu-emulate-atomics"> {
"Chipset that these operations will run on">];
}

// Pass resolving memref.extract_strided_metadata over AMDGPU cast ops.
def AmdgpuResolveStridedMetadataPass : Pass<"amdgpu-resolve-strided-metadata"> {
  let summary = "Resolve memref.extract_strided_metadata on AMDGPU ops";
  let description = [{
    This pass rewrites `memref.extract_strided_metadata` operations
    targeting the AMDGPU dialect casts.

    The patterns in this pass should normally be run alongside those in
    -expand-strided-metadata, and creating a pass that combines those two
    sets of patterns is the recommended way to use this functionality.
    However, this pass (which will likely need a second -expand-strided-metadata
    after it) is provided so that simple usecases do not need to create custom passes.
    These patterns have not been added to -expand-strided-metadata to
    prevent the memref dialect from depending on platform-specific code.
  }];
  let dependentDialects = [
    "arith::ArithDialect",
    "memref::MemRefDialect"
  ];
}

#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_
Loading