Skip to content

Commit 09e94d0

Browse files
authored
Revert "[MLIR][XeGPU] Updates XeGPU TensorDescAttr and Refine Gather/Scatter definition. " (#109666)
Reverts #109144
1 parent ecb98f9 commit 09e94d0

File tree

7 files changed

+168
-359
lines changed

7 files changed

+168
-359
lines changed

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td

Lines changed: 19 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,12 @@ class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
1919
let mnemonic = attrMnemonic;
2020
}
2121

22-
class XeGPU_TensorDescAttr<string name, string attrMnemonic, list<Trait> traits = [],
23-
string baseCppClass = "::mlir::Attribute">
24-
: XeGPUAttr<name, attrMnemonic, traits, baseCppClass> {
25-
let assemblyFormat = "`<` struct(params) `>`";
26-
}
27-
28-
def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_tdesc_attr"> {
22+
def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> {
2923
let summary = [{a composite attribute for `TensorDescType`}];
30-
let description = [{`BlockTensorDesc` (or `block_tdesc_attr`) is a composite
24+
let description = [{`TensorDescAttr` (or `tdesc_attr`) is a composite
3125
attribute defined for `TensorDescType` for describing following
3226
properties of a `TensorDesc`.
33-
1. `memory_space`: It describes where the data block described by the
27+
1. `memory_scope`: It describes where the data block described by the
3428
TensorDesc is located, `Global` device memory or `Shared` local memory.
3529
It defaults to `Global`.
3630
2. `array_length`: It describes how many horizontally consecutive blocks
@@ -39,63 +33,43 @@ def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_td
3933
8x32. Its default value is 1.
4034
3. `boundary_check`: It indicates to the hardware whether to do
4135
out-of-boundary check. The default value is true.
36+
4. `scattered`: It is used to differentiate TensorDescs created from
37+
`create_nd_tdesc` vs from `create_tdesc`.
4238
}];
4339

4440
let parameters = (ins
45-
OptionalParameter<"MemorySpaceAttr">: $memory_space,
41+
OptionalParameter<"MemoryScopeAttr">: $memory_scope,
4642
OptionalParameter<"IntegerAttr", "1">: $array_length,
47-
OptionalParameter<"BoolAttr", "true">: $boundary_check
43+
OptionalParameter<"BoolAttr", "true">: $boundary_check,
44+
OptionalParameter<"BoolAttr", "false">: $scattered
4845
);
4946

5047
let builders = [
5148
AttrBuilder<(ins
52-
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
49+
CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope,
5350
CArg<"int", "1">:$array_length,
54-
CArg<"bool", "true">: $boundary_check
51+
CArg<"bool", "true">: $boundary_check,
52+
CArg<"bool", "false">: $scattered
5553
)>
5654
];
5755

56+
let assemblyFormat = "`<` struct(params) `>`";
5857
}
5958

60-
def XeGPU_ScatterTensorDescAttr: XeGPU_TensorDescAttr<"ScatterTensorDesc", "scatter_tdesc_attr"> {
61-
let summary = [{a composite attribute for `TensorDescType`}];
62-
let description = [{`ScatterTensorDesc` (or `scatter_tdesc_attr`) is a composite
63-
attribute defined for `TensorDescType` for describing following
64-
properties of a `TensorDesc`.
65-
1. `memory_space`: It describes where the data block described by the
66-
TensorDesc is located, `Global` device memory or `Shared` local memory.
67-
It is default to `Global`.
68-
2. `chunk_size`: indicates number of continious elements accessed for each
69-
offset, default is 1. It is used with `scattered` attr only.
70-
}];
71-
72-
let parameters = (ins
73-
OptionalParameter<"MemorySpaceAttr">: $memory_space,
74-
OptionalParameter<"IntegerAttr", "1">: $chunk_size
75-
);
76-
77-
let builders = [
78-
AttrBuilder<(ins
79-
CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space,
80-
CArg<"int", "1">: $chunk_size
81-
)>
82-
];
83-
}
84-
8559
//===----------------------------------------------------------------------===//
8660
// XeGPU Memory Scope Enums.
8761
//===----------------------------------------------------------------------===//
88-
def XeGPU_MemorySpaceGlobal: I32EnumAttrCase<"Global", 0, "global">;
89-
def XeGPU_MemorySpaceShared: I32EnumAttrCase<"SLM", 3, "slm">;
90-
def XeGPU_MemorySpace: I32EnumAttr<"MemorySpace",
62+
def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">;
63+
def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">;
64+
def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope",
9165
"The address space of the memory the tensor descritor is created for",
92-
[XeGPU_MemorySpaceGlobal, XeGPU_MemorySpaceShared]> {
66+
[XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> {
9367
let genSpecializedAttr = 0;
9468
let cppNamespace = "::mlir::xegpu";
9569
}
9670

97-
def XeGPU_MemorySpaceAttr:
98-
EnumAttr<XeGPU_Dialect, XeGPU_MemorySpace, "memory_space"> {
71+
def XeGPU_MemoryScopeAttr:
72+
EnumAttr<XeGPU_Dialect, XeGPU_MemoryScope, "memory_scope"> {
9973
let summary = [{Describe the location of data described by a `TensorDesc`:
10074
Global device memory (`Global`) or Shared local memory (`SLM`).}];
10175
let assemblyFormat = "$value";
@@ -142,4 +116,4 @@ def XeGPU_FenceScopeAttr:
142116
let assemblyFormat = "$value";
143117
}
144118

145-
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
119+
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td

Lines changed: 25 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -218,23 +218,6 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
218218
static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
219219

220220
mlir::Value getViewSource() { return getSource(); }
221-
222-
unsigned getSourceMemorySpace() {
223-
auto srcTy = getSourceType();
224-
if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
225-
auto attr = memrefTy.getMemorySpace();
226-
if (attr) {
227-
if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr)) {
228-
return static_cast<unsigned>(intAttr.getInt());
229-
}
230-
if (auto memSpaceAttr = llvm::dyn_cast<MemorySpaceAttr>(attr))
231-
return static_cast<unsigned>(memSpaceAttr.getValue());
232-
}
233-
}
234-
// take global as default memory scope.
235-
return static_cast<unsigned>(MemorySpace::Global);
236-
}
237-
238221
}];
239222
}
240223

@@ -428,10 +411,8 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
428411
is fixed to the hardware supported subgroup size, e.g., 16 on PVC,
429412
implying each element in the array corresponds to a work-item (SIMT lane)
430413
in the subgroup.
431-
432-
The first dimension of the result TensorDesc corresponds to work-items, so it should
433-
match the dimension of offsets. It may also has a second dimension corresponding to
434-
the chunk_size if the chunk size is larger than 1.
414+
* chunk_size: [optional attribute] indicates the number of contiguous
415+
elements accessed for each offset, default is 1.
435416

436417
Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
437418
```mlir
@@ -443,22 +424,29 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
443424
It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
444425
```mlir
445426
%0 = memref.alloc() : memref<1024xf32>
446-
%1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>
427+
%1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
447428
```
448429

449430
Example 3. It is similar to Example 2, but there are some overlaps among work-items.
450431
It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
451432
```mlir
452433
%0 = memref.alloc() : memref<1024xf32>
453-
%1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>>
434+
%1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
454435
```
455436
}];
456437

457438
let arguments = (ins XeGPU_BaseAddrType: $source,
458439
Variadic<Index>: $offsets,
459-
DenseI64ArrayAttr: $const_offsets);
440+
DenseI64ArrayAttr: $const_offsets,
441+
DefaultValuedAttr<I64Attr, "1">: $chunk_size);
460442
let results = (outs XeGPU_TensorDesc:$TensorDesc);
461443

444+
let builders = [
445+
OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source,
446+
"llvm::ArrayRef<OpFoldResult>": $offsets,
447+
CArg<"uint32_t", "1"> : $chunk_size)>,
448+
];
449+
462450
let assemblyFormat = [{
463451
$source
464452
custom<DynamicIndexList>($offsets, $const_offsets)
@@ -485,22 +473,6 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
485473
assert(idx < getNumOffsets() && "Invalid out of bound access.");
486474
return getMixedOffsets()[idx];
487475
}
488-
489-
unsigned getSourceMemorySpace() {
490-
auto srcTy = getSource().getType();
491-
if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
492-
auto attr = memrefTy.getMemorySpace();
493-
if (attr) {
494-
if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr))
495-
return static_cast<unsigned>(intAttr.getInt());
496-
if (auto memSpaceAttr = llvm::dyn_cast<MemorySpaceAttr>(attr))
497-
return static_cast<unsigned>(memSpaceAttr.getValue());
498-
}
499-
}
500-
// take global as default memory scope.
501-
return static_cast<unsigned>(MemorySpace::Global);
502-
}
503-
504476
}];
505477

506478
let hasVerifier = 1;
@@ -548,31 +520,28 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
548520

549521
let description = [{ It (aka. load) loads data for each work-item. The output
550522
describes the data being loaded at the subgroup level, so its size is
551-
consistent with the number of work-items in a subgroup. When the chunk size
552-
is larger than 2, the output vector is a 2D vector, with dim-1 correspoding
553-
to work-items, and dim-0 corresponding to the chunk_size loaded by each work-item.
554-
Specially, there is a transpose effect on the result (as compared to the TensorDesc)
555-
due to the hardware implementation. Therefore, a transpose attribute is introduced
556-
on purpose, making sure users are aware of this implicit transformation.
523+
consistent with the number of work-items in a subgroup. When `chunk_size_per_lane`
524+
attribute is larger than 1 in TensorDesc, the output vector will be a 2D vector,
525+
with dim-1 corresponding to the chunk size.
557526

558527
The mask operand masks out memory access so that it is safe to pass out-of-boundary
559528
addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
560529

561530
Example:
562531
```mlir
563-
%2 = xegpu.load %1, %0 {transpose,
532+
%2 = xegpu.load %1, %0 {transpose = [1, 0],
564533
l1_hint = #xegpu.cache_hint<cached>,
565534
l2_hint = #xegpu.cache_hint<uncached>,
566535
l3_hint = #xegpu.cache_hint<uncached>}
567-
: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
568-
vector<16xi1> -> vector<16xf32>
536+
: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1>
537+
-> vector<16xf32>
569538
```
570539

571540
}];
572541

573542
let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
574543
XeGPU_MaskType: $mask,
575-
OptionalAttr<UnitAttr>: $transpose,
544+
OptionalAttr<DenseI64ArrayAttr>: $transpose,
576545
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
577546
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
578547
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
@@ -604,15 +573,11 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
604573
let hasVerifier = 1;
605574
}
606575

607-
def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "TensorDesc"]>,
608-
AllElementTypesMatch<["value", "TensorDesc"]>]> {
576+
def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>,
577+
AllElementTypesMatch<["value", "TensorDesc"]>]> {
609578
let summary = "store data to scattered memory locations.";
610-
let description = [{ It (aka. store) stores data to scattered memory locations. The value is
611-
typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
612-
a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes
613-
and the dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter`
614-
has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
615-
introduced on purpose, making sure users are aware of this implicit transformation.
579+
let description = [{ It (aka. store) stores data to scattered memory locations.
580+
It has similar semantic to `load_gather`.
616581

617582
Example:
618583
```mlir
@@ -627,7 +592,6 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "T
627592
XeGPU_ValueType: $value,
628593
XeGPU_TensorDesc: $TensorDesc,
629594
XeGPU_MaskType: $mask,
630-
OptionalAttr<UnitAttr>: $transpose,
631595
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
632596
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
633597
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
@@ -759,7 +723,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
759723

760724
def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
761725
AllElementTypesMatch<["tensorDesc", "value", "result"]>,
762-
AllShapesMatch<["tensorDesc", "value", "result"]>]> {
726+
AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> {
763727
let summary = "Atomic read-modify-write operation on the TensorDesc. ";
764728

765729
let description = [{
@@ -844,7 +808,7 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
844808
2. `Fence_scope` describes the scope of fence. "Workgroup" means that the scope would be
845809
within each workgroup. "GPU" means the scope would be across workgroups within the GPU.
846810
}];
847-
let arguments = (ins XeGPU_MemorySpaceAttr: $memory_kind,
811+
let arguments = (ins XeGPU_MemoryScopeAttr: $memory_kind,
848812
XeGPU_FenceScopeAttr: $fence_scope);
849813
let assemblyFormat = [{`memory_kind` `=` `` $memory_kind `,` `fence_scope` `=` `` $fence_scope attr-dict}];
850814
let extraClassDeclaration = extraBaseClassDeclaration;

0 commit comments

Comments
 (0)