#ifndef MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD
#define MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD

// ArithBase.td provides AtomicRMWKindAttr, used by xegpu.atomic_rmw below.
include "mlir/Dialect/Arith/IR/ArithBase.td"
include "mlir/Dialect/XeGPU/IR/XeGPUAttrs.td"
include "mlir/Dialect/XeGPU/IR/XeGPUDialect.td"
include "mlir/Dialect/XeGPU/IR/XeGPUTypes.td"
@@ -36,7 +36,7 @@ class XeGPU_Op<string mnemonic, list<Trait> traits = []>:
36
36
37
37
static ::mlir::ParseResult parseProperties(::mlir::OpAsmParser &parser,
38
38
::mlir::OperationState &result) {
39
- if (mlir::succeeded(parser.parseLess ())) {
39
+ if (mlir::succeeded(parser.parseOptionalLess ())) {
40
40
if (parser.parseAttribute(result.propertiesAttr) || parser.parseGreater())
41
41
return failure();
42
42
}
@@ -254,7 +254,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "Tensor
254
254
a block of data from memory to register. It takes a set of optional cache
255
255
hints for each level of cache, L1, L2 and L3. If hardware does not have a
256
256
correspoding cache, Corresponding cache hint attribute will be masked.
257
- vnni transform is an hardware feature for Intel GPU, which is used to
257
+ VNNI transformation is an hardware feature for Intel GPU, which is used to
258
258
do data packing during the load for B operand of matrix operation, if
259
259
the bit width of the data type is less then 32 bits, e.g., fp16. And
260
260
transpose is another Intel hardware feature, which will do transpose
@@ -425,10 +425,6 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
425
425
%0 = memref.alloc() : memref<1024xf32>
426
426
%1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
427
427
```
428
-
429
-
430
-
431
-
432
428
}];
433
429
434
430
let arguments = (ins XeGPU_BaseAddrType: $source,
@@ -663,4 +659,153 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
663
659
}];
664
660
}
665
661
662
def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]> {
  let summary = "It performs mma computation";

  let description = [{DPAS performs matrix multiplication on matrix A of `mxk`
    size, B of `kxn` size, and accumulate on matrix C of `mxn` to the same size
    matrix, `m=8`, `n=16` and `k=8 * 32/bit_width_of_elem_type`. So for fp16
    data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
    and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
    also requires A and B to be loaded with the required data layout. Specially,
    VNNI layout is required for B operand. It is achieved via setting `vnni_axis = 0`
    of the corresponding `load_nd` operator. To keep both operands as 3D vector,
    operand A is loaded via setting `vnni_axis = 1` without impacting the
    physical layouts change in register. Due to the VNNI transformation, A and B
    operands are represented as 3D vector, with the last dimension representing
    the VNNI factor, which is computed as `32/bit_width_of_elem_type`. Therefore,
    `A: vector<8x16xf16>` is represented as `A: vector<8x8x2xf16>`, and
    `B: vector<16x16xf16>` is represented as `B: vector<8x16x2xf16>`.

    Note: on PVC, the hardware can perform load with VNNI transformation when data
    element type is 16-bit or lower precision, taking 2 or 4 elements from
    the first dimension and inserted into the newly added innermost dimension.
  }];

  // The accumulator is optional; when absent, the op computes A x B only.
  let arguments = (ins
    XeGPU_DpasOpType : $lhs,
    XeGPU_DpasOpType : $rhs,
    Optional<XeGPU_Vector2DType>: $acc);
  let results = (outs XeGPU_Vector2DType: $result);

  // Convenience accessors returning the operand/result VectorTypes directly.
  // getAccType() returns a null VectorType when no accumulator is present.
  let extraClassDeclaration = [{
    VectorType getLhsType() {
      return getLhs().getType();
    }

    VectorType getRhsType() {
      return getRhs().getType();
    }

    VectorType getAccType() {
      if (getAcc())
        return getAcc().getType();
      return {};
    }

    VectorType getResultType() {
      return getResult().getType();
    }
  }];

  // `acc` and its type are printed only when the accumulator operand exists.
  let assemblyFormat = [{
    $lhs `,` $rhs (`,` $acc^)? attr-dict `:` type($lhs)`,` type($rhs) (`,` type($acc)^)? `->` type($result)
  }];

  // Shape/layout constraints beyond the declared traits are checked in C++.
  let hasVerifier = 1;
}
def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
      AllElementTypesMatch<["tensorDesc", "value", "result"]>,
      AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> {
  // NOTE(review): `Pure` looks suspicious for an op that modifies memory --
  // confirm against the dialect's intended side-effect modeling.
  let summary = "Atomic read-modify-write operation on the TensorDesc. ";

  let description = [{
    The `xegpu.atomic_rmw` operation provides a way to perform a read-modify-write
    operation on the region described by the `TensorDesc` free from data races. The
    `kind` enumeration specifies the modification to be performed. The `mask` operand
    has the same shape as the `TensorDesc`, and is used to enable or disable specific
    data points of the `TensorDesc`. The `value` operand represents the new value to
    be applied during the modification.
  }];

  // `kind` reuses arith's AtomicRMWKindAttr (addf, addi, maxs, ...) rather
  // than defining a separate XeGPU enum.
  let arguments = (ins
    AtomicRMWKindAttr:$kind,
    XeGPU_TensorDesc:$tensorDesc,
    XeGPU_MaskType:$mask,
    XeGPU_ValueType:$value);

  let results = (outs XeGPU_ValueType:$result);

  let assemblyFormat = [{
    $kind $tensorDesc `,` $mask `,` $value attr-dict `:`
    type($tensorDesc) `,` type($mask) `,` type($value) `->` type($result)
  }];
}
def XeGPU_AllocNbarrierOp: XeGPU_Op<"alloc_nbarrier", []> {
  let summary = "It allocates a set of named barriers.";
  let description = [{AllocNbarrier is to create a set of named barriers as
    specified by `nbarrier_num`. Named barriers are workgroup level resources,
    and are shared by all threads in the workgroup. For example, there are
    up to 32 barriers (range 0-31) for each XeCore on PVC. A typical use case
    is that a workgroup is partitioned into N subgroups of threads (N <= 32),
    and each subgroup coordinates its work with a separate barrier with id
    range from 0 to N respectively.}];
  // Attribute (not operand): the barrier count must be a compile-time constant.
  let arguments = (ins I64Attr: $nbarrier_num);
  let assemblyFormat = "$nbarrier_num attr-dict";
}
def XeGPU_InitNbarrierOp: XeGPU_Op<"init_nbarrier", []> {
  let summary = "It assigns a named barrier to the current thread.";
  let description = [{InitNbarrierOp assigns the named barrier with the specified
    barrier ID (0~31) to the current thread. Multiple threads may bind to the
    same named barrier, and the `participant_thread_num` specifies the total
    number of threads associated with the nbarrier. It returns an object of
    NbarrierType representing the barrier.}];

  // Both inputs are runtime i8 SSA values, matching the hardware's 0-31 ID
  // range and the per-barrier participant count.
  let arguments = (ins I8: $nbarrier_id,
                       I8: $participant_thread_num);
  let results = (outs XeGPU_Nbarrier: $result);
  // `qualified` prints the full dialect-prefixed type name for the result.
  let assemblyFormat = [{
    $nbarrier_id `,` $participant_thread_num attr-dict `:`
    type($nbarrier_id) `,` type($participant_thread_num) `->` qualified(type($result))
  }];
}
def XeGPU_NbarrierArriveOp: XeGPU_Op<"nbarrier_arrive", []> {
  let summary = "It signals the arrival at the named barrier.";
  let description = [{NbarrierArriveOp signals the hardware (or other threads)
    that the current thread has produced its data for the consumer threads. When
    the hardware is signalled by `participant_thread_num` threads for the named
    barrier, it will notify the threads waiting for the named barrier to
    continue their work.}];

  let arguments = (ins XeGPU_Nbarrier: $nbarrier);
  let assemblyFormat = [{ $nbarrier attr-dict `:` qualified(type($nbarrier))}];
}
def XeGPU_NbarrierWaitOp: XeGPU_Op<"nbarrier_wait", []> {
  let summary = "It waits for a named barrier.";
  let description = [{NbarrierWaitOp signals the hardware which named barrier
    the current thread is waiting for, such that it can get notified when the
    named barrier is completed.}];
  let arguments = (ins XeGPU_Nbarrier: $nbarrier);
  let assemblyFormat = [{ $nbarrier attr-dict `:` qualified(type($nbarrier)) }];
}
def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
  let summary = "It synchronizes memory accesses.";
  let description = [{It synchronizes the memory access between
    write and following read or write.
    1. `Memory_kind` describes the memory kind. "global" means the global memory,
        "slm" means the shared local memory.
    2. `Fence_scope` describes the scope of fence. "Workgroup" means that the scope would be
        within each workgroup. "GPU" means the scope would be across workgroups within the GPU.
  }];
  let arguments = (ins XeGPU_MemoryScopeAttr: $memory_kind,
                       XeGPU_FenceScopeAttr: $fence_scope);
  // The empty `` suppresses the space ODS would otherwise insert after `=`.
  let assemblyFormat = [{`memory_kind` `=` `` $memory_kind `,` `fence_scope` `=` `` $fence_scope attr-dict}];
  let extraClassDeclaration = extraBaseClassDeclaration;
}
#endif // MLIR_DIALECT_XEGPU_IR_XEGPUOPS_TD