@@ -53,47 +53,56 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
let summary = "Create nd-tensor descriptor operation";
let description = [{
The "create_nd_tdesc" operation creates a TensorDescType which represents
- a sub-view of a 2D memory region (It can be extended to support n-D memory
- region if needed in future). Elements in the subview continuous in each
- dimension. It encodes the following important information for supporting
- Intel hardware features:
-
- * source: an object representing (starting address/pointer of) a 2D memory region.
- It can be either a 2D memref object, or simply a pointer represented by uint64_t type.
- for the later case, the shape and layout information of the 2D memory region should
- be explicitly passed via `shape` and `strides` parameters.
- * offsets: two index values represents offsets from the "source" at the each dimension
- at which the subview of the target memory will be created. It is encoded via two
- variables, including "offsets" and "const_offsets", such that it can
- accept various forms, such as, operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]).
- * shape: the shape information of the memory region pointed by the "source". It is
- typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
+ a sub-view of a 1D/2D memory region within the one or two innermost dimensions
+ of the source. (It can be extended to support n-D memory regions if needed in
+ the future.) Elements in the subview are contiguous in each dimension. It encodes
+ the following important information for supporting Intel hardware features:
+
+ * source: an object representing (the starting address/pointer of) a memory region.
+ It can be either a memref object, or simply a pointer represented by a uint64_t type.
+ In the case of a dynamic memref or a pointer, the shape and layout information of
+ the memory region should be explicitly passed via the `shape` and `strides` parameters.
+
+ * offsets: index values representing offsets from the "source" in each dimension
+ at which the subview of the target memory will be created. It is encoded via
+ "offsets" and "const_offsets", so that it can accept various forms, such as
+ operands (e.g., [%c0, %c]) and attributes (e.g., [2, 4]).
+
+ * shape: the shape information of the memory region pointed to by the "source". It is
+ typically encoded via the MemRefType of the source, e.g., memref<4096x4096xf16>.
But if "source" is simply a pointer represented as uint64_t type, or a memref
type without shape information e.g., memref<?x?xf16>, the shape information has
to be explicitly passed via the "shape" and "const_shape" arguments.
+
* strides: the strides of the memory region pointed by the "source". Similar to shape,
it is typically encoded via the MemRefType of the source too. But if "source" is
simply a pointer represented as uint64_t type, or a memref type without shape
information e.g., memref<?x?xf16>, the strides information has to be explicitly
passed via the "strides" and "const_strides" argument.

Example 1 (suppose the tensor shape inferred by the compiler is 8x16):
+ ```mlir
%0 = memref.alloc() : memref<1024x1024xf32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%1 = xegpu.create_nd_tdesc %0[%c0, %c0]: memref<1024x1024xf32> -> TensorDesc<8x16xf32>
+ ```

Example 2 (suppose the tensor shape inferred by the compiler is 8x16):
+ ```mlir
%0 = memref.alloc(%h, %w) : memref<?x?xf32>
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: memref<?x?xf32> -> TensorDesc<8x16xf32>
+ ```

Example 3 (suppose the tensor shape inferred by the compiler is 8x16):
+ ```mlir
%0 = ... : ui64
%c0 = arith.constant 0 : index
%c1 = arith.constant 1 : index
%1 = xegpu.create_nd_tdesc %0[%c0, %c0], [%h, %w], [%w, %c1]: ui64 -> TensorDesc<8x16xf32>
+ ```
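+
+ Example 4 (a hypothetical sketch, assuming the same 8x16 inferred shape as above:
+ offsets may mix attribute and operand forms):
+ ```mlir
+ %0 = memref.alloc() : memref<1024x1024xf32>
+ %c0 = arith.constant 0 : index
+ %1 = xegpu.create_nd_tdesc %0[2, %c0]: memref<1024x1024xf32> -> TensorDesc<8x16xf32>
+ ```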
}];
let arguments = (ins
@@ -219,7 +228,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
memory regions to each level of the cache based on their cache policy.

Example:
- ```
+ ```mlir
xegpu.prefetch_nd %tdesc {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<cached>,
l3_hint = #xegpu.cache_hint<cached>}
@@ -245,8 +254,7 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
}

- def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "TensorDesc"]>,
- AllElementCountsMatch<["value", "TensorDesc"]>]> {
+ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
let summary = "loads a n-D block from memory (represented by TensorDesc)"
"to registers (represented by vector)";
let description = [{
@@ -263,7 +271,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "Tensor
same time.

Example:
- ```
+ ```mlir
xegpu.load_nd %1 {transpose = [1, 0],
l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>,
@@ -275,7 +283,7 @@ def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "Tensor
}];

let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
- OptionalAttr<I64Attr>: $vnni_axis,
+ OptionalAttr<UnitAttr>: $packed,
OptionalAttr<DenseI64ArrayAttr>: $transpose,
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
@@ -309,7 +317,7 @@ def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllShapesMatch<["value", "TensorDesc
Corresponding cache hint attribute will be masked.

Example:
- ```
+ ```mlir
xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
@@ -407,21 +415,21 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
elements accessed for each offset, default is 1.

Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
- ```
+ ```mlir
%a = memref.alloc() : memref<1024xf32>
%1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32>
```

Example 2. It assumes subgroup size is 4, and each workitem access 8 elements.
It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71]
- ```
+ ```mlir
%0 = memref.alloc() : memref<1024xf32>
%1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
```

Example 3. It is similar to Example 2, but there is some overlaps among workitems.
It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
- ```
+ ```mlir
%0 = memref.alloc() : memref<1024xf32>
%1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32>
```
@@ -480,7 +488,7 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
it works on scattered TensorDesc instead.

Example:
- ```
+ ```mlir
xegpu.prefetch %tdesc {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<cached>,
l3_hint = #xegpu.cache_hint<cached>}
@@ -520,7 +528,7 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.

Example:
- ```
+ ```mlir
%2 = xegpu.load %1, %0 {transpose = [1, 0],
l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>,
@@ -572,7 +580,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDe
It has similar semantic to `load_gather`.

Example:
- ```
+ ```mlir
%3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
@@ -621,7 +629,7 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
shifts for each work-item.

Example:
- ```
+ ```mlir
%2 = xegpu.update_offset %1, [32, 32, 32, 32]
: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
```
@@ -668,14 +676,12 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
data type, the matrices are `A: vector<8x16xf16>`, `B: vector<16x16xf16>`,
and `C/D: vector<8x16xf32>`. Besides the matrix size requirements, DPAS
also requires A and B to be loaded with the required data layout. Specially,
- VNNI layout is required for B operand. It is achieved via setting `vnni_axis = 0`
- of the corresponding `load_nd` operator. To keep both operands as 3D vector,
- operand A is loaded via setting `vnni_axis = 1` without impacting the
- physical layouts change in register. Due to the VNNI transformation, A and B operands
- are represented as 3D vector, with the last dimension representing the VNNI factor,
- which is computed as `32/bit_width_of_elem_type`. Therefore, `A: vector<8x16xf16>`
- is represented as `A: vector<8x8x2xf16>`, and `B: vector<16x16xf16>` is
- represented as `B: vector<8x16x2xf16>`.
+
+ VNNI layout is required for the B operand. It is achieved by adding the `packed`
+ attribute to the corresponding `load_nd` operator. Due to the VNNI transformation,
+ the B operand can be represented as a 3D vector, with the last dimension
+ representing the VNNI factor, which is computed as `32/bit_width_of_elem_type`.
+ Thus, `B: vector<16x16xf16>` can be represented as `B: vector<8x16x2xf16>`.
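+
+ For illustration, a packed load of B might look like the following (a sketch;
+ the concrete shapes here are assumptions for this example, not requirements):
+ ```mlir
+ %b = xegpu.load_nd %1 {packed} : !xegpu.tensor_desc<16x16xf16> -> vector<8x16x2xf16>
+ ```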

Note: on PVC, the hardware can perform load with VNNI transformation when data
element type is 16-bit or lower precision, taking 2 or 4 elements from
@@ -739,7 +745,7 @@ def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
let assemblyFormat = [{
$kind $tensorDesc `,` $mask `,` $value attr-dict `:`
- type($tensorDesc) `,` type($mask) `,` type($value) `->` type($result)
+ qualified(type($tensorDesc)) `,` type($mask) `,` type($value) `->` type($result)
}];
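+
+ // Illustrative sketch only (assumed shapes and kind, not taken from this patch):
+ // with `qualified`, the tensor_desc type is printed in its full form, e.g.
+ //   %r = xegpu.atomic_rmw addf %tdesc, %mask, %value :
+ //     !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered = true>>, vector<16xi1>, vector<16xf32> -> vector<16xf32>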
}