
[MLIR][XeGPU] Updates XeGPU TensorDescAttr and Refine Gather/Scatter definition. #109144


Merged

64 changes: 45 additions & 19 deletions mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td
@@ -19,12 +19,18 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
let mnemonic = attrMnemonic;
}

def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
class XeGPU_TensorDescAttr<string name, string attrMnemonic, list<Trait> traits = [],
string baseCppClass = "::mlir::Attribute">
: XeGPUAttr<name, attrMnemonic, traits, baseCppClass> {
let assemblyFormat = "`<` struct(params) `>`";
}

def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_tdesc_attr"> {
let summary = [{a composite attribute for `TensorDescType`}];
let description = [{`TensorDescAttr` (or `tdesc_attr`) is a composite
let description = [{`BlockTensorDesc` (or `block_tdesc_attr`) is a composite
attribute defined for `TensorDescType` for describing the following
properties of a `TensorDesc`.
1. `memory_scope`: It describes where the data block described by the
1. `memory_space`: It describes where the data block described by the
TensorDesc is located, `Global` device memory or `Shared` local memory.
It defaults to `Global`.
2. `array_length`: It describes how many horizontally consecutive blocks
@@ -33,43 +39,63 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
8x32. Its default value is 1.
3. `boundary_check`: It indicates whether the hardware should perform
out-of-boundary checks. The default value is true.
4. `scattered`: It is used to differenciate TensorDescs created from
`create_nd_tdesc` vs from `create_tdesc`.
}];

let parameters = (ins
OptionalParameter<"MemoryScopeAttr">: $memory_scope,
OptionalParameter<"MemorySpaceAttr">: $memory_space,
OptionalParameter<"IntegerAttr", "1">: $array_length,
OptionalParameter<"BoolAttr", "true">: $boundary_check,
OptionalParameter<"BoolAttr", "false">: $scattered
OptionalParameter<"BoolAttr", "true">: $boundary_check
);

let builders = [
AttrBuilder<(ins
CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
CArg<"int", "1">:$array_length,
CArg<"bool", "true">: $boundary_check,
CArg<"bool", "false">: $scattered
CArg<"bool", "true">: $boundary_check
)>
];

let assemblyFormat = "`<` struct(params) `>`";
}

def XeGPU_ScatterTensorDescAttr: XeGPU_TensorDescAttr<"ScatterTensorDesc", "scatter_tdesc_attr"> {
let summary = [{a composite attribute for `TensorDescType`}];
let description = [{`ScatterTensorDesc` (or `scatter_tdesc_attr`) is a composite
attribute defined for `TensorDescType` for describing the following
properties of a `TensorDesc`.
1. `memory_space`: It describes where the data block described by the
TensorDesc is located, `Global` device memory or `Shared` local memory.
It defaults to `Global`.
2. `chunk_size`: indicates the number of contiguous elements accessed for each
offset; the default is 1. It is only meaningful for scattered TensorDescs.
}];

let parameters = (ins
OptionalParameter<"MemorySpaceAttr">: $memory_space,
OptionalParameter<"IntegerAttr", "1">: $chunk_size
);

let builders = [
AttrBuilder<(ins
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
CArg<"int", "1">: $chunk_size
)>
];
}
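For illustration only (not part of the diff), here is a rough sketch of how the two new attributes might appear on a `TensorDescType` in IR, assuming the `struct(params)` assembly format above; the element types, shapes, and exact parameter printing are made up:

```mlir
// Blocked descriptor in global memory: two consecutive 8x16 blocks, with boundary check.
!xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_space = global, array_length = 2, boundary_check = true>>

// Scattered descriptor in shared local memory: 8 contiguous elements per offset.
!xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space = slm, chunk_size = 8>>
```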

//===----------------------------------------------------------------------===//
// XeGPU Memory Scope Enums.
//===----------------------------------------------------------------------===//
def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
def XeGPU_MemorySpaceGlobal: I32EnumAttrCase<"Global", 0, "global">;
def XeGPU_MemorySpaceShared: I32EnumAttrCase<"SLM", 3, "slm">;
def XeGPU_MemorySpace: I32EnumAttr<"MemorySpace",
"The address space of the memory the tensor descriptor is created for",
[XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
[XeGPU_MemorySpaceGlobal, XeGPU_MemorySpaceShared]> {
let genSpecializedAttr = 0;
let cppNamespace = "::mlir::xegpu";
}

def XeGPU_MemoryScopeAttr:
EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
def XeGPU_MemorySpaceAttr:
EnumAttr<XeGPU_Dialect, XeGPU_MemorySpace, "memory_space"> {
let summary = [{Describe the location of data described by a `TensorDesc`:
Global device memory (`Global`) or Shared local memory (`SLM`).}];
let assemblyFormat = "$value";
@@ -116,4 +142,4 @@ def XeGPU_FenceScopeAttr:
let assemblyFormat = "$value";
}

#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
86 changes: 61 additions & 25 deletions mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -218,6 +218,23 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }

mlir::Value getViewSource() { return getSource(); }

unsigned getSourceMemorySpace() {
auto srcTy = getSourceType();
if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
auto attr = memrefTy.getMemorySpace();
if (attr) {
if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr)) {
return static_cast<unsigned>(intAttr.getInt());
}
if (auto memSpaceAttr = llvm::dyn_cast<MemorySpaceAttr>(attr))
return static_cast<unsigned>(memSpaceAttr.getValue());
}
}
// Take global as the default memory space.
return static_cast<unsigned>(MemorySpace::Global);
}

}];
}

@@ -411,8 +428,10 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
is fixed to the hardware supported subgroup size, e.g., 16 on PVC,
implying each element in the array corresponds to a work-item (SIMT lane)
in the subgroup.
* chunk_size: [optional attribute] indicates number of continious
elements accessed for each offset, default is 1.

The first dimension of the result TensorDesc corresponds to work-items, so it should
match the dimension of offsets. It may also have a second dimension corresponding to
the chunk_size if the chunk size is larger than 1.

Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
```mlir
@@ -424,29 +443,22 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
It will access 32 data elements in total: a[0:7], a[16:23], a[32:39], a[64:71]
```mlir
%0 = memref.alloc() : memref<1024xf32>
%1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
%1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>
```

Example 3. It is similar to Example 2, but there is some overlap among work-items.
It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
```mlir
%0 = memref.alloc() : memref<1024xf32>
%1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
%1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>
```
}];

let arguments = (ins XeGPU_BaseAddrType: $source,
Variadic<Index>: $offsets,
DenseI64ArrayAttr: $const_offsets,
DefaultValuedAttr<I64Attr, "1">: $chunk_size);
DenseI64ArrayAttr: $const_offsets);
let results = (outs XeGPU_TensorDesc:$TensorDesc);

let builders = [
OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source,
"llvm::ArrayRef<OpFoldResult>": $offsets,
CArg<"uint32_t", "1"> : $chunk_size)>,
];

let assemblyFormat = [{
$source
custom<DynamicIndexList>($offsets, $const_offsets)
@@ -473,6 +485,22 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
assert(idx < getNumOffsets() && "Invalid out of bound access.");
return getMixedOffsets()[idx];
}

unsigned getSourceMemorySpace() {
auto srcTy = getSource().getType();
if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
auto attr = memrefTy.getMemorySpace();
if (attr) {
if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr))
return static_cast<unsigned>(intAttr.getInt());
if (auto memSpaceAttr = llvm::dyn_cast<MemorySpaceAttr>(attr))
return static_cast<unsigned>(memSpaceAttr.getValue());
}
}
// Take global as the default memory space.
return static_cast<unsigned>(MemorySpace::Global);
}

}];

let hasVerifier = 1;
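As a further hedged illustration (again not part of the diff): with `memory_space` now carried by the descriptor type and `getSourceMemorySpace()` reading the source memref's address space, a `create_tdesc` on shared local memory would presumably look roughly like the sketch below. The memref address-space number 3 (matching the new SLM enum value) and the exact attribute printing are assumptions:

```mlir
%0 = memref.alloc() : memref<1024xf32, 3>
%1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32, 3>
     -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<memory_space = slm>>
```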
Expand Down Expand Up @@ -520,28 +548,31 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]

let description = [{ It (aka. load) loads data per work-item. The output
describes the data being loaded at the subgroup level, so its size is
consistent with the number of work-items in a subgroup. When `chunk_size_per_lane`
attribute is larger than 1 in TensorDesc, the output vector will be 2D vector,
with dim-1 correspoding to the chunk size.
consistent with the number of work-items in a subgroup. When the chunk size
is larger than 1, the output vector is a 2D vector, with dim-1 corresponding
to work-items, and dim-0 corresponding to the chunk_size loaded by each work-item.
Specifically, there is a transpose effect on the result (as compared to the TensorDesc)
due to the hardware implementation. Therefore, a transpose attribute is introduced
to make sure users are aware of this implicit transformation.

The mask operand masks out memory access so that it is safe to pass out-of-boundary
addresses/offsets as long as they are masked. The mask applies per SIMD lane.

Example:
```mlir
%2 = xegpu.load %1, %0 {transpose = [1, 0],
%2 = xegpu.load %1, %0 {transpose,
l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>,
l3_hint = #xegpu.cache_hint<uncached>}
: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1>
-> vector<16xf32>
: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
vector<16xi1> -> vector<16xf32>
```

}];

let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
XeGPU_MaskType: $mask,
OptionalAttr<DenseI64ArrayAttr>: $transpose,
OptionalAttr<UnitAttr>: $transpose,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
@@ -573,11 +604,15 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
let hasVerifier = 1;
}
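To make the transpose effect concrete, here is a hedged sketch (not part of the diff) of a chunked gather: a descriptor of 16 lanes with `chunk_size = 8` would load into a vector whose dim-0 is the chunk and dim-1 the lanes. The SSA names and exact attribute printing are assumptions:

```mlir
%v = xegpu.load %tdesc, %mask {transpose}
     : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<16xi1>
     -> vector<8x16xf32>
```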

def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>,
AllElementTypesMatch<["value", "TensorDesc"]>]> {
def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "TensorDesc"]>,
AllElementTypesMatch<["value", "TensorDesc"]>]> {
let summary = "store data to scattered memory locations.";
let description = [{ It (aka. store) stores data to scattered memory locations.
It has similar semantic to `load_gather`.
let description = [{ It (aka. store) stores data to scattered memory locations. The value is
typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
a 2D vector instead. In the latter case, dim-1 of the value corresponds to the SIMD lanes
and dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter`
has a transpose effect, similar to `load_gather`. Therefore, a transpose attribute is
introduced to make sure users are aware of this implicit transformation.

Example:
```mlir
@@ -592,6 +627,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDe
XeGPU_ValueType: $value,
XeGPU_TensorDesc: $TensorDesc,
XeGPU_MaskType: $mask,
OptionalAttr<UnitAttr>: $transpose,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
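Analogously, a hedged sketch (not part of the diff) of a chunked scatter store using the new `transpose` unit attribute: the stored value has dim-0 as the chunk and dim-1 as the lanes. Operand names and printing details are assumptions:

```mlir
xegpu.store %val, %tdesc, %mask {transpose}
     : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>, vector<16xi1>
```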
@@ -723,7 +759,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]

def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
AllElementTypesMatch<["tensorDesc", "value", "result"]>,
AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> {
AllShapesMatch<["tensorDesc", "value", "result"]>]> {
let summary = "Atomic read-modify-write operation on the TensorDesc.";

let description = [{
Expand Down Expand Up @@ -808,7 +844,7 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
2. `Fence_scope` describes the scope of fence. "Workgroup" means that the scope would be
within each workgroup. "GPU" means the scope would be across workgroups within the GPU.
}];
let arguments = (ins XeGPU_MemoryScopeAttr: $memory_kind,
let arguments = (ins XeGPU_MemorySpaceAttr: $memory_kind,
XeGPU_FenceScopeAttr: $fence_scope);
let assemblyFormat = [{`memory_kind` `=` `` $memory_kind `,` `fence_scope` `=` `` $fence_scope attr-dict}];
let extraClassDeclaration = extraBaseClassDeclaration;
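Given that assemblyFormat, a fence restricted to shared local memory within a workgroup would presumably print roughly as below; the exact enum keyword spellings are assumptions:

```mlir
xegpu.fence memory_kind = slm, fence_scope = workgroup
```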