@@ -424,9 +424,9 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
    It accepts the following parameters:

    * source: a 1D memref or pointer (uint64_t) representing the flattened memory object.
-   * offsets: a array containing offsets of each access point. Its size
+   * offsets: a vector containing offsets of each access point. Its size
      is fixed to the hardware supported subgroup size, e.g., 16 on PVC,
-     implying each element in the array corresponds to a work-item (SIMT lane)
+     implying each element in the vector corresponds to a work-item (SIMT lane)
      in the subgroup.

    The first dimension of the result TensorDesc corresponds to work-items, so it should
@@ -436,56 +436,59 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
    Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
    ```mlir
    %a = memref.alloc() : memref<1024xf32>
-   %1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32>
+   %0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
+   %1 = xegpu.create_tdesc %a, %0: memref<1024xf32>, vector<4xindex> -> TensorDesc<4xf32>
    ```

    Example 2. It assumes subgroup size is 4, and each work-item accesses 8 elements.
    It will access 32 data elements in total: a[0:7], a[16:23], a[32:39], a[64:71]
    ```mlir
    %0 = memref.alloc() : memref<1024xf32>
-   %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>
+   %off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
+   %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
+        -> TensorDesc<4x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
    ```

    Example 3. It is similar to Example 2, but there is some overlap among work-items.
    It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
    ```mlir
    %0 = memref.alloc() : memref<1024xf32>
-   %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>>
+   %off = arith.constant dense<[0, 4, 8, 12]> : vector<4xindex>
+   %1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
+        -> TensorDesc<4x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
    ```
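
    For orientation, a minimal sketch of reading back through the chunked descriptor from
    Example 2, assuming an all-true mask; per the `load` documentation further below, the
    chunked result comes back transposed:
    ```mlir
    %0    = memref.alloc() : memref<1024xf32>
    %off  = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
    %td   = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
            -> !xegpu.tensor_desc<4x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>
    %mask = arith.constant dense<true> : vector<4xi1>
    // dim-1 of the result maps to the 4 lanes, dim-0 to the 8-element chunk per lane.
    %val  = xegpu.load %td, %mask {transpose}
            : !xegpu.tensor_desc<4x8xf32, #xegpu.scatter_tdesc_attr<chunk_size = 8>>,
              vector<4xi1> -> vector<8x4xf32>
    ```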
  }];

  let arguments = (ins XeGPU_BaseAddrType: $source,
-                       Variadic<Index>: $offsets,
-                       DenseI64ArrayAttr: $const_offsets);
+                       XeGPU_OffsetType: $offsets);

  let results = (outs XeGPU_TensorDesc:$TensorDesc);

+ let builders = [
+   OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source,
+               "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+   OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source,
+               "llvm::ArrayRef<int64_t>": $offsets)>,
+ ];
+
let assemblyFormat = [{
-   $source
-   custom<DynamicIndexList>($offsets, $const_offsets)
-   attr-dict `:` type($source) `->` qualified(type($TensorDesc))
+   $source `,` $offsets attr-dict `:` type($source) `,` type($offsets) `->` qualified(type($TensorDesc))
  }];

- let extraClassDeclaration = extraBaseClassDeclaration # [{
+ let extraClassDeclaration = [{
    xegpu::TensorDescType getTensorDescType() {
      return getTensorDesc().getType();
    }

-   SmallVector<OpFoldResult> getMixedOffsets() {
-     Builder b(getContext());
-     return getMixedValues(getConstOffsets(), getOffsets(), b);
+   mlir::VectorType getOffsetsType() {
+     return getOffsets().getType();
    }

    size_t getNumOffsets() {
-     return getMixedOffsets().size();
+     return getOffsetsType().getNumElements();
    }

    mlir::Value getViewSource() { return getSource(); }

-   OpFoldResult getOffset(unsigned idx) {
-     assert(idx < getNumOffsets() && "Invalid out of bound access.");
-     return getMixedOffsets()[idx];
-   }
-
    unsigned getSourceMemorySpace() {
      auto srcTy = getSource().getType();
      if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
@@ -550,24 +553,33 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
    describes the data being loaded at the subgroup level, so its size is
    consistent with the number of work-items in a subgroup. When the chunk size
    is larger than 2, the output vector is a 2D vector, with dim-1 corresponding
-   to work-items, and dim-0 corresponding to the chunk_size loaded by each work-item.
+   to work-items, and dim-0 corresponding to the chunk size loaded by each work-item.
    In particular, there is a transpose effect on the result (as compared to the TensorDesc)
    due to the hardware implementation. Therefore, a transpose attribute is introduced
    on purpose, making sure users are aware of this implicit transformation.

    The mask operand masks out memory access so that it is safe to pass out-of-boundary
    addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
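
    For illustration, one way such a mask can be built (a minimal sketch; the 16-lane width
    and the `%valid` in-bounds lane count are assumed values, not requirements of the op):
    ```mlir
    // Lane indices 0..15 of the subgroup.
    %lanes = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : vector<16xindex>
    // %valid (index) holds the number of lanes whose addresses are in bounds.
    %bound = vector.splat %valid : vector<16xindex>
    // Lanes at or beyond %valid are masked off; their addresses are never touched.
    %mask  = arith.cmpi slt, %lanes, %bound : vector<16xindex>
    ```
    The resulting `vector<16xi1>` is what the examples below pass as the mask operand.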

-   Example:
+   Example 1:
    ```mlir
-   %2 = xegpu.load %1, %0 {transpose,
-                   l1_hint = #xegpu.cache_hint<cached>,
+   %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
                    l2_hint = #xegpu.cache_hint<uncached>,
                    l3_hint = #xegpu.cache_hint<uncached>}
          : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
            vector<16xi1> -> vector<16xf32>
    ```

+   Example 2:
+   ```mlir
+   %2 = xegpu.load %1, %0 {transpose,
+                   l1_hint = #xegpu.cache_hint<cached>,
+                   l2_hint = #xegpu.cache_hint<uncached>,
+                   l3_hint = #xegpu.cache_hint<uncached>}
+         : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
+           vector<16xi1> -> vector<8x16xf32>
+   ```
+
  }];

let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
@@ -610,17 +622,27 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "T
  let description = [{ It (a.k.a. store) stores data to scattered memory locations. The value is
    typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
    a 2D vector instead. For the latter case, dim-1 of the value corresponds to the SIMD lanes
-   and the dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter`
+   and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
    has a transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
    introduced on purpose, making sure users are aware of this implicit transformation.

-   Example:
+   Example 1:
    ```mlir
    %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
                                 l2_hint = #xegpu.cache_hint<write_back>,
                                 l3_hint = #xegpu.cache_hint<write_through>}
-         : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1>
+         : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>
+   ```
+
+   Example 2:
+   ```mlir
+   %3 = xegpu.store %0, %1, %2 {transpose,
+                                l1_hint = #xegpu.cache_hint<uncached>,
+                                l2_hint = #xegpu.cache_hint<write_back>,
+                                l3_hint = #xegpu.cache_hint<write_through>}
+         : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<chunk_size=8>>, vector<16xi1>
    ```
+
  }];

let arguments = (ins
@@ -666,40 +688,39 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
    Example:
    ```mlir
-   %2 = xegpu.update_offset %1, [32, 32, 32, 32]
-        : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+   %off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
+   %2 = xegpu.update_offset %1, %off :
+        !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xindex>
    ```
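
    A common use is to advance the same descriptor across loop iterations; a minimal sketch
    (the buffer, trip count, and all-true mask are assumed values for illustration):
    ```mlir
    %src  = memref.alloc() : memref<1024xf32>
    %off0 = arith.constant dense<[0, 16, 32, 48]> : vector<4xindex>
    %td0  = xegpu.create_tdesc %src, %off0 : memref<1024xf32>, vector<4xindex>
            -> !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
    %mask = arith.constant dense<true> : vector<4xi1>
    // Each iteration moves every lane forward by 64 elements.
    %step = arith.constant dense<64> : vector<4xindex>
    %c0 = arith.constant 0 : index
    %c1 = arith.constant 1 : index
    %c8 = arith.constant 8 : index
    %last = scf.for %i = %c0 to %c8 step %c1
        iter_args(%td = %td0) -> (!xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>) {
      %v = xegpu.load %td, %mask
           : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xi1> -> vector<4xf32>
      %next = xegpu.update_offset %td, %step
           : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>, vector<4xindex>
      scf.yield %next : !xegpu.tensor_desc<4xf32, #xegpu.scatter_tdesc_attr<>>
    }
    ```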
  }];

let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
-                       Variadic<Index>: $offsets,
-                       DenseI64ArrayAttr: $const_offsets);
+                       XeGPU_OffsetType: $offsets);

let results = (outs XeGPU_TensorDesc: $result);

- let extraClassDeclaration = extraBaseClassDeclaration # [{
+ let builders = [
+   OpBuilder<(ins "mlir::Value": $TensorDesc,
+               "llvm::ArrayRef<OpFoldResult>": $offsets)>,
+   OpBuilder<(ins "mlir::Value": $TensorDesc,
+               "llvm::ArrayRef<int64_t>": $offsets)>
+ ];
+
+ let extraClassDeclaration = [{
    xegpu::TensorDescType getTensorDescType() {
      return getTensorDesc().getType();
    }

-   SmallVector<OpFoldResult> getMixedOffsets() {
-     Builder b(getContext());
-     return getMixedValues(getConstOffsets(), getOffsets(), b);
+   mlir::VectorType getOffsetsType() {
+     return getOffsets().getType();
    }

    size_t getNumOffsets() {
-     return getMixedOffsets().size();
-   }
-
-   OpFoldResult getOffset(unsigned idx) {
-     assert(idx < getNumOffsets() && "Invalid out of bound access.");
-     return getMixedOffsets()[idx];
+     return getOffsetsType().getNumElements();
    }
  }];

  let assemblyFormat = [{
-   $TensorDesc `,`
-   custom<DynamicIndexList>($offsets, $const_offsets)
-   attr-dict `:` qualified(type($TensorDesc))
+   $TensorDesc `,` $offsets attr-dict `:` qualified(type($TensorDesc)) `,` type($offsets)
  }];
}