Skip to content

Commit 03bb10d

Browse files
chencha3joker-ephadam-smnk
authored
[MLIR][XeGPU] Add dpas, atomic, and named barrier ops (#88973)
--------- Co-authored-by: Mehdi Amini <[email protected]> Co-authored-by: Adam Siemieniuk <[email protected]>
1 parent 7911615 commit 03bb10d

File tree

9 files changed

+296
-14
lines changed

9 files changed

+296
-14
lines changed

mlir/include/mlir/Dialect/XeGPU/IR/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@ add_mlir_dialect(XeGPU xegpu)
22
add_mlir_doc(XeGPU XeGPU Dialects/ -gen-dialect-doc -dialect=xegpu)
33

44
set(LLVM_TARGET_DEFINITIONS XeGPU.td)
5-
mlir_tablegen(XeGPUAttrs.h.inc -gen-attrdef-decls)
6-
mlir_tablegen(XeGPUAttrs.cpp.inc -gen-attrdef-defs)
5+
mlir_tablegen(XeGPUAttrs.h.inc -gen-attrdef-decls -attrdefs-dialect=xegpu)
6+
mlir_tablegen(XeGPUAttrs.cpp.inc -gen-attrdef-defs -attrdefs-dialect=xegpu)
77
add_public_tablegen_target(MLIRXeGPUAttrsIncGen)
88
add_dependencies(mlir-headers MLIRXeGPUAttrsIncGen)
99

10-
set(LLVM_TARGET_DEFINITIONS XeGPU.td)
10+
set(LLVM_TARGET_DEFINITIONS XeGPUAttrs.td)
1111
mlir_tablegen(XeGPUEnums.h.inc -gen-enum-decls)
1212
mlir_tablegen(XeGPUEnums.cpp.inc -gen-enum-defs)
1313
add_public_tablegen_target(MLIRXeGPUEnumsIncGen)

mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define MLIR_DIALECT_XEGPU_IR_XEGPU_H
1111

1212
#include "mlir/Bytecode/BytecodeOpInterface.h"
13+
#include "mlir/Dialect/Arith/IR/Arith.h"
1314
#include "mlir/IR/BuiltinTypes.h"
1415
#include "mlir/IR/Dialect.h"
1516
#include "mlir/IR/TypeUtilities.h"
@@ -19,7 +20,7 @@
1920

2021
namespace mlir {
2122
namespace xegpu {
22-
// placeholder
23+
class TensorDescType;
2324
} // namespace xegpu
2425
} // namespace mlir
2526

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD
1111

1212
include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
13+
include "mlir/IR/AttrTypeBase.td"
1314
include "mlir/IR/EnumAttr.td"
1415

1516
class XeGPUAttr<string name, string attrMnemonic, list<Trait> traits = [],
@@ -98,4 +99,21 @@ def XeGPU_CacheHintAttr
9899
let assemblyFormat = "`<` $value `>`";
99100
}
100101

102+
def XeGPU_FenceScopeWorkgroup: I32EnumAttrCase<"Workgroup", 0, "workgroup">;
103+
def XeGPU_FenceScopeGPU: I32EnumAttrCase<"GPU", 1, "gpu">;
104+
def XeGPU_FenceScope: I32EnumAttr<"FenceScope",
105+
"The enumeration for the scope of fence operation.",
106+
[XeGPU_FenceScopeWorkgroup, XeGPU_FenceScopeGPU]> {
107+
let genSpecializedAttr = 0;
108+
let cppNamespace = "::mlir::xegpu";
109+
}
110+
111+
def XeGPU_FenceScopeAttr:
112+
EnumAttr<XeGPU_Dialect, XeGPU_FenceScope, "fence_scope"> {
113+
let summary = [{Describes the scope of fence.
114+
"workgroup" means that the scope is within each work group.
115+
"gpu" means the scope is across work groups within the gpu.}];
116+
let assemblyFormat = "$value";
117+
}
118+
101119
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUDialect.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,14 @@ def XeGPU_Dialect : Dialect {
1717
let summary = "The XeGPU dialect that models Intel GPU's ISA";
1818
let description = [{
1919
The XeGPU dialect models Intel Xe ISA semantics but works at vector and
20-
TensorDesc data type. It provides 1:1 mappings to match Xe instructions
20+
TensorDesc data type. It provides 1:1 mappings to match Xe instructions
2121
like DPAS and 2D block load. The matrix size being processed at this level
2222
exactly matches the hardware instructions or the intrinsic supported by
2323
the lower-level GPU compiler.
2424
}];
2525

26+
let dependentDialects = ["arith::ArithDialect"];
27+
2628
let useDefaultTypePrinterParser = true;
2729
let useDefaultAttributePrinterParser = true;
2830
}

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td

Lines changed: 152 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
1010
#define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
1111

12-
include "mlir/IR/AttrTypeBase.td"
12+
include "mlir/Dialect/Arith/IR/ArithBase.td"
1313
include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
1414
include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
1515
include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
@@ -36,7 +36,7 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
3636

3737
static ::mlir::ParseResult parseProperties(::mlir::OpAsmParser &parser,
3838
::mlir::OperationState &result) {
39-
if (mlir::succeeded(parser.parseLess())) {
39+
if (mlir::succeeded(parser.parseOptionalLess())) {
4040
if (parser.parseAttribute(result.propertiesAttr) || parser.parseGreater())
4141
return failure();
4242
}
@@ -254,7 +254,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "Tensor
254254
a block of data from memory to register. It takes a set of optional cache
255255
hints for each level of cache, L1, L2 and L3. If hardware does not have a
256256
corresponding cache, the corresponding cache hint attribute will be masked.
257-
vnni transform is an hardware feature for Intel GPU, which is used to
257+
VNNI transformation is a hardware feature for Intel GPU, which is used to
258258
do data packing during the load for B operand of matrix operation, if
259259
the bit width of the data type is less than 32 bits, e.g., fp16. And
260260
transpose is another Intel hardware feature, which will do transpose
@@ -425,10 +425,6 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
425425
%0 = memref.alloc() : memref<1024xf32>
426426
%1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
427427
```
428-
429-
430-
431-
432428
}];
433429

434430
let arguments = (ins XeGPU_BaseAddrType: $source,
@@ -663,4 +659,153 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
663659
}];
664660
}
665661

662+
def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]> {
663+
let summary = "It performs mma computation";
664+
665+
let description = [{DPAS performs matrix multiplication on matrix A of `mxk`
666+
size, B of `kxn` size, and accumulate on matrix C of `mxn` to the same size
667+
matrix, where `m=8`, `n=16` and `k=8 * 32/bit_width_of_elem_type`. So for fp16
668+
data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
669+
and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
670+
also requires A and B to be loaded with the required data layout. Specifically,
671+
VNNI layout is required for B operand. It is achieved via setting `vnni_axis = 0`
672+
of the corresponding `load_nd` operator. To keep both operands as 3D vector,
673+
operand A is loaded via setting `vnni_axis = 1` without impacting the
674+
physical layout in registers. Due to the VNNI transformation, A and B operands
675+
are represented as 3D vector, with the last dimension representing the VNNI factor,
676+
which is computed as `32/bit_width_of_elem_type`. Therefore, `A: vector<8x16xf16>`
677+
is represented as `A: vector<8x8x2xf16>`, and `B: vector<16x16xf16>` is
678+
represented as `B: vector<8x16x2xf16>`.
679+
680+
Note: on PVC, the hardware can perform load with VNNI transformation when data
681+
element type is 16-bit or lower precision, taking 2 or 4 elements from
682+
the first dimension and inserted into the newly added innermost dimension.
683+
}];
684+
685+
let arguments = (ins
686+
XeGPU_DpasOpType : $lhs,
687+
XeGPU_DpasOpType : $rhs,
688+
Optional<XeGPU_Vector2DType>: $acc);
689+
let results = (outs XeGPU_Vector2DType: $result);
690+
691+
let extraClassDeclaration = [{
692+
VectorType getLhsType() {
693+
return getLhs().getType();
694+
}
695+
696+
VectorType getRhsType() {
697+
return getRhs().getType();
698+
}
699+
700+
VectorType getAccType() {
701+
if (getAcc())
702+
return getAcc().getType();
703+
return {};
704+
}
705+
706+
VectorType getResultType() {
707+
return getResult().getType();
708+
}
709+
}];
710+
711+
let assemblyFormat = [{
712+
$lhs `,` $rhs (`,` $acc^)? attr-dict `:` type($lhs)`,` type($rhs) (`,` type($acc)^)? `->` type($result)
713+
}];
714+
715+
let hasVerifier = 1;
716+
}
717+
718+
def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
719+
AllElementTypesMatch<["tensorDesc", "value", "result"]>,
720+
AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> {
721+
let summary = "Atomic read-modify-write operation on the TensorDesc. ";
722+
723+
let description = [{
724+
The `xegpu.atomic_rmw` operation provides a way to perform a read-modify-write
725+
operation on the region described by the `TensorDesc` free from data races. The
726+
`kind` enumeration specifies the modification to be performed. The `mask` operand
727+
has the same shape as `TensorDesc`, and is used to enable or disable specific
728+
data points of the `TensorDesc`. The `value` operand represents the new value to
729+
be applied during the modification.
730+
}];
731+
732+
let arguments = (ins
733+
AtomicRMWKindAttr:$kind,
734+
XeGPU_TensorDesc:$tensorDesc,
735+
XeGPU_MaskType:$mask,
736+
XeGPU_ValueType:$value);
737+
738+
let results = (outs XeGPU_ValueType:$result);
739+
740+
let assemblyFormat = [{
741+
$kind $tensorDesc `,` $mask `,` $value attr-dict `:`
742+
type($tensorDesc) `,` type($mask) `,` type($value) `->` type($result)
743+
}];
744+
}
745+
746+
def XeGPU_AllocNbarrierOp: XeGPU_Op<"alloc_nbarrier", []> {
747+
let summary = "It allocates a set of named barriers.";
748+
let description = [{AllocNbarrier is to create a set of named barriers as
749+
specified by `nbarrier_num`. Named barriers are workgroup level resources,
750+
and are shared by all threads in the workgroup. For example, there are
751+
up to 32 barriers (range 0-31) for each XeCore on PVC. A typical use case
752+
is that a workgroup is partitioned into N subgroups of threads (N <= 32),
753+
and each subgroup coordinates its work with a separate barrier with id
754+
range from 0 to N respectively.}];
755+
let arguments = (ins I64Attr: $nbarrier_num);
756+
let assemblyFormat = "$nbarrier_num attr-dict";
757+
}
758+
759+
def XeGPU_InitNbarrierOp: XeGPU_Op<"init_nbarrier", []> {
760+
let summary = "It assigns a named barrier to the current thread.";
761+
let description = [{InitNbarrierOp assigns the named barrier with the specified
762+
barrier ID (0~31) to the current thread. Multiple threads may bind to the
763+
same named barrier, and the `participant_thread_num` specifies the total
764+
number of threads associated with the nbarrier. It returns an object of
765+
NbarrierType representing the barrier}];
766+
767+
let arguments = (ins I8: $nbarrier_id,
768+
I8: $participant_thread_num);
769+
let results = (outs XeGPU_Nbarrier: $result);
770+
let assemblyFormat = [{
771+
$nbarrier_id `,` $participant_thread_num attr-dict `:`
772+
type($nbarrier_id) `,` type($participant_thread_num) `->` qualified(type($result))
773+
}];
774+
}
775+
776+
def XeGPU_NbarrierArriveOp: XeGPU_Op<"nbarrier_arrive", []> {
777+
let summary = "It signals the arrival at the named barrier.";
778+
let description = [{NbarrierArriveOp signals the hardware (or other threads)
779+
that the current thread has produced its data for the consumer threads. When
780+
the hardware is signalled by `participant_thread_num` threads for the named barrier,
781+
it will notify the threads waiting for the named barrier to continue their work.}];
782+
783+
let arguments = (ins XeGPU_Nbarrier: $nbarrier);
784+
let assemblyFormat = [{ $nbarrier attr-dict `:` qualified(type($nbarrier))}];
785+
}
786+
787+
def XeGPU_NbarrierWaitOp: XeGPU_Op<"nbarrier_wait", []> {
788+
let summary = "It waits for a named barrier.";
789+
let description = [{NbarrierWaitOp signals the hardware which named barrier
790+
the current thread is waiting for, such that it can get notified when the
791+
named barrier is completed.}];
792+
let arguments = (ins XeGPU_Nbarrier: $nbarrier);
793+
let assemblyFormat = [{ $nbarrier attr-dict `:` qualified(type($nbarrier)) }];
794+
}
795+
796+
def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
797+
let summary = "It synchronizes memory accesses.";
798+
let description = [{It synchronizes the memory access between
799+
write and following read or write.
800+
1. `Memory_kind` describes the memory kind. "global" means the global memory,
801+
"slm" means the shared local memory.
802+
2. `Fence_scope` describes the scope of fence. "Workgroup" means that the scope would be
803+
within each workgroup. "GPU" means the scope would be across workgroups within the GPU.
804+
}];
805+
let arguments = (ins XeGPU_MemoryScopeAttr: $memory_kind,
806+
XeGPU_FenceScopeAttr: $fence_scope);
807+
let assemblyFormat = [{`memory_kind` `=` `` $memory_kind `,` `fence_scope` `=` `` $fence_scope attr-dict}];
808+
let extraClassDeclaration = extraBaseClassDeclaration;
809+
}
810+
666811
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,4 +151,15 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc",
151151

152152
}
153153

154+
155+
def XeGPU_Nbarrier: XeGPUTypeDef<"Nbarrier", "nbarrier", [], "mlir::Type"> {
156+
let summary = "!xegpu.nbarrier is a custom XeGPU type representing a barrier.";
157+
158+
let extraClassDeclaration = [{
159+
static NbarrierType get(mlir::MLIRContext *context) {
160+
return Base::get(context);
161+
};
162+
}];
163+
}
164+
154165
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUTYPES_TD

mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,28 @@ LogicalResult StoreScatterOp::verify() {
406406

407407
return success();
408408
}
409+
//===----------------------------------------------------------------------===//
410+
// XeGPU_DpasOp
411+
//===----------------------------------------------------------------------===//
412+
// Verifier for the dpas op. It checks, in order:
//   1. lhs and rhs are both rank-3 vectors;
//   2. when an accumulator operand is present, its type equals the result type;
//   3. lhs dim 1 equals rhs dim 0, and the trailing (third) dims of lhs and rhs
//      are equal -- reported as a K-dimension / vnni-factor mismatch.
// Returns success() when every check passes; otherwise returns the diagnostic
// produced by emitOpError.
// NOTE(review): nothing here relates the result shape to lhs dim 0 or rhs
// dim 1 -- confirm whether that constraint is enforced elsewhere.
LogicalResult DpasOp::verify() {
413+
int64_t lhsRank = getLhsType().getRank();
414+
int64_t rhsRank = getRhsType().getRank();
415+
416+
// Both operands must be rank 3; this also makes the shape indexing below safe.
if (lhsRank != rhsRank || lhsRank != 3)
417+
return emitOpError(
418+
"lhs and rhs rank does not match for dpas op, or their rank is not 3.");
419+
420+
// The accumulator is optional; it is only compared when it is present.
if (getAcc() && getAccType() != getResultType())
421+
return emitOpError("Accumulator and Result for dpas op should have the "
422+
"same type (both shape and element type).");
423+
424+
auto lhsShape = getLhsType().getShape();
425+
auto rhsShape = getRhsType().getShape();
426+
// lhs dim 1 must match rhs dim 0, and the last dims of both must agree.
if (lhsShape[1] != rhsShape[0] || lhsShape[2] != rhsShape[2])
427+
return emitOpError("K-dimension or vnni-factor mismatch.");
428+
429+
return success();
430+
}
409431

410432
} // namespace xegpu
411433
} // namespace mlir

mlir/test/Dialect/XeGPU/XeGPUOps.mlir

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ gpu.func @test_prefetch_vc(%src: ui64) {
8080
//CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
8181
%1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
8282
// CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}> : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
83-
xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
83+
xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<uncached>}>: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
8484
gpu.return
8585
}
8686

@@ -121,4 +121,59 @@ gpu.func @test_create_update_tdesc_vc(%src: ui64) {
121121
gpu.return
122122
}
123123

124-
}
124+
// CHECK: gpu.func @test_dpas_vc(%[[arg0:.*]]: vector<8x8x2xf16>, %[[arg1:.*]]: vector<8x16x2xf16>)
125+
gpu.func @test_dpas_vc(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) {
126+
// CHECK: %0 = xegpu.dpas %[[arg0]], %[[arg1]] : vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
127+
%1 = xegpu.dpas %a, %b: vector<8x8x2xf16>, vector<8x16x2xf16> -> vector<8x16xf32>
128+
gpu.return
129+
}
130+
131+
// CHECK: gpu.func @test_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>)
132+
gpu.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) {
133+
//CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>
134+
%1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]: ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>
135+
//CHECK: %[[R1:.*]] = xegpu.atomic_rmw addf %[[R0]], %[[arg2]], %[[arg1]] : <16xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
136+
xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>
137+
gpu.return
138+
}
139+
140+
// CHECK: gpu.func @alloc_nbarrier({{.*}}) {
141+
gpu.func @alloc_nbarrier() {
142+
// CHECK: xegpu.alloc_nbarrier
143+
xegpu.alloc_nbarrier 8
144+
gpu.return
145+
}
146+
147+
// CHECK: gpu.func @init_nbarrier({{.*}}) {
148+
gpu.func @init_nbarrier() {
149+
//CHECK: %[[c1:.*]] = arith.constant 1 : i8
150+
//CHECK: %[[c16:.*]] = arith.constant 16 : i8
151+
%nbarrier_id = arith.constant 1 : i8
152+
%threads_count = arith.constant 16 : i8
153+
//CHECK: xegpu.init_nbarrier %[[c1]], %[[c16]] : i8, i8 -> !xegpu.nbarrier
154+
%nbarrier = xegpu.init_nbarrier %nbarrier_id, %threads_count : i8, i8 -> !xegpu.nbarrier
155+
gpu.return
156+
}
157+
158+
// CHECK: gpu.func @nbarrier_arrive(%[[arg0:.*]]: !xegpu.nbarrier) {
159+
gpu.func @nbarrier_arrive(%nbarrier : !xegpu.nbarrier) {
160+
//CHECK: xegpu.nbarrier_arrive %[[arg0]] : !xegpu.nbarrier
161+
xegpu.nbarrier_arrive %nbarrier : !xegpu.nbarrier
162+
gpu.return
163+
}
164+
165+
// CHECK: gpu.func @nbarrier_wait(%[[arg0:.*]]: !xegpu.nbarrier) {
166+
gpu.func @nbarrier_wait(%nbarrier : !xegpu.nbarrier) {
167+
//CHECK: xegpu.nbarrier_wait %[[arg0]] : !xegpu.nbarrier
168+
xegpu.nbarrier_wait %nbarrier : !xegpu.nbarrier
169+
gpu.return
170+
}
171+
172+
// CHECK-LABEL: gpu.func @fence({{.*}}) {
173+
gpu.func @fence() {
174+
//CHECK: xegpu.fence memory_kind = global, fence_scope = workgroup
175+
xegpu.fence memory_kind = global, fence_scope = workgroup
176+
gpu.return
177+
}
178+
179+
}

0 commit comments

Comments
 (0)