@@ -218,23 +218,6 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface
218
218
static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
219
219
220
220
mlir::Value getViewSource() { return getSource(); }
221
-
222
- unsigned getSourceMemorySpace() {
223
- auto srcTy = getSourceType();
224
- if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
225
- auto attr = memrefTy.getMemorySpace();
226
- if (attr) {
227
- if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr)) {
228
- return static_cast<unsigned>(intAttr.getInt());
229
- }
230
- if (auto memSpaceAttr = llvm::dyn_cast<MemorySpaceAttr>(attr))
231
- return static_cast<unsigned>(memSpaceAttr.getValue());
232
- }
233
- }
234
- // take global as default memory scope.
235
- return static_cast<unsigned>(MemorySpace::Global);
236
- }
237
-
238
221
}];
239
222
}
240
223
@@ -428,10 +411,8 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
428
411
is fixed to the hardware supported subgroup size, e.g., 16 on PVC,
429
412
implying each element in the array corresponds to a work-item (SIMT lane)
430
413
in the subgroup.
431
-
432
- The first dimension of the result TensorDesc corresponds to work-items, so it should
433
- match the dimension of offsets. It may also has a second dimension corresponding to
434
- the chunk_size if the chunk size is larger than 1.
414
+ * chunk_size: [optional attribute] indicates number of contiguous
415
+ elements accessed for each offset, default is 1.
435
416
436
417
Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
437
418
```mlir
@@ -443,22 +424,29 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
443
424
It accesses 32 data elements in total: a[0:7], a[16:23], a[32:39], a[64:71]
444
425
```mlir
445
426
%0 = memref.alloc() : memref<1024xf32>
446
- %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8 >
427
+ %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8} : memref<1024xf32> -> TensorDesc<4x8xf32>
447
428
```
448
429
449
430
Example 3. It is similar to Example 2, but there are some overlaps among work-items.
450
431
It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
451
432
```mlir
452
433
%0 = memref.alloc() : memref<1024xf32>
453
- %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8> >
434
+ %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8} : memref<1024xf32> -> TensorDesc<4x8xf32>
454
435
```
455
436
}];
456
437
457
438
let arguments = (ins XeGPU_BaseAddrType: $source,
458
439
Variadic<Index>: $offsets,
459
- DenseI64ArrayAttr: $const_offsets);
440
+ DenseI64ArrayAttr: $const_offsets,
441
+ DefaultValuedAttr<I64Attr, "1">: $chunk_size);
460
442
let results = (outs XeGPU_TensorDesc:$TensorDesc);
461
443
444
+ let builders = [
445
+ OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source,
446
+ "llvm::ArrayRef<OpFoldResult>": $offsets,
447
+ CArg<"uint32_t", "1"> : $chunk_size)>,
448
+ ];
449
+
462
450
let assemblyFormat = [{
463
451
$source
464
452
custom<DynamicIndexList>($offsets, $const_offsets)
@@ -485,22 +473,6 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
485
473
assert(idx < getNumOffsets() && "Invalid out of bound access.");
486
474
return getMixedOffsets()[idx];
487
475
}
488
-
489
- unsigned getSourceMemorySpace() {
490
- auto srcTy = getSource().getType();
491
- if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
492
- auto attr = memrefTy.getMemorySpace();
493
- if (attr) {
494
- if (auto intAttr = llvm::dyn_cast<mlir::IntegerAttr>(attr))
495
- return static_cast<unsigned>(intAttr.getInt());
496
- if (auto memSpaceAttr = llvm::dyn_cast<MemorySpaceAttr>(attr))
497
- return static_cast<unsigned>(memSpaceAttr.getValue());
498
- }
499
- }
500
- // take global as default memory scope.
501
- return static_cast<unsigned>(MemorySpace::Global);
502
- }
503
-
504
476
}];
505
477
506
478
let hasVerifier = 1;
@@ -548,31 +520,28 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
548
520
549
521
let description = [{ It (aka. load) loads data for each work-item. The output
550
522
describes the data being loaded at the subgroup level, so its size is
551
- consistent with the number of work-items in a subgroup. When the chunk size
552
- is larger than 2, the output vector is a 2D vector, with dim-1 correspoding
553
- to work-items, and dim-0 corresponding to the chunk_size loaded by each work-item.
554
- Specially, there is a transpose effect on the result (as compared to the TensorDesc)
555
- due to the hardware implementation. Therefore, a transpose attribute is introduced
556
- on purpose, making sure users are aware of this implicit transformation.
523
+ consistent with the number of work-items in a subgroup. When `chunk_size_per_lane`
524
+ attribute is larger than 1 in TensorDesc, the output vector will be a 2D vector,
525
+ with dim-1 corresponding to the chunk size.
557
526
558
527
The mask operand masks out memory access so that it is safe to pass out-of-boundary
559
528
addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
560
529
561
530
Example:
562
531
```mlir
563
- %2 = xegpu.load %1, %0 {transpose,
532
+ %2 = xegpu.load %1, %0 {transpose = [1, 0] ,
564
533
l1_hint = #xegpu.cache_hint<cached>,
565
534
l2_hint = #xegpu.cache_hint<uncached>,
566
535
l3_hint = #xegpu.cache_hint<uncached>}
567
- : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global >>,
568
- vector<16xi1> -> vector<16xf32>
536
+ : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true >>, vector<16xi1>
537
+ -> vector<16xf32>
569
538
```
570
539
571
540
}];
572
541
573
542
let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
574
543
XeGPU_MaskType: $mask,
575
- OptionalAttr<UnitAttr >: $transpose,
544
+ OptionalAttr<DenseI64ArrayAttr >: $transpose,
576
545
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
577
546
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
578
547
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
@@ -604,15 +573,11 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
604
573
let hasVerifier = 1;
605
574
}
606
575
607
- def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch <["value", "TensorDesc"]>,
608
- AllElementTypesMatch<["value", "TensorDesc"]>]> {
576
+ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch <["value", "TensorDesc"]>,
577
+ AllElementTypesMatch<["value", "TensorDesc"]>]> {
609
578
let summary = "store data to scattered memory locations.";
610
- let description = [{ It (aka. store) stores data to scattered memory locations. The value is
611
- typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
612
- a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes
613
- and the dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter`
614
- has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
615
- introduced on purpose, making sure users are aware of this implicit transformation.
579
+ let description = [{ It (aka. store) stores data to scattered memory locations.
580
+ It has similar semantics to `load_gather`.
616
581
617
582
Example:
618
583
```mlir
@@ -627,7 +592,6 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "T
627
592
XeGPU_ValueType: $value,
628
593
XeGPU_TensorDesc: $TensorDesc,
629
594
XeGPU_MaskType: $mask,
630
- OptionalAttr<UnitAttr>: $transpose,
631
595
OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
632
596
OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
633
597
OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
@@ -759,7 +723,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>]
759
723
760
724
def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure,
761
725
AllElementTypesMatch<["tensorDesc", "value", "result"]>,
762
- AllShapesMatch<["tensorDesc", "value", "result"]>]> {
726
+ AllShapesMatch<["tensorDesc", "mask", " value", "result"]>]> {
763
727
let summary = "Atomic read-modify-write operation on the TensorDesc. ";
764
728
765
729
let description = [{
@@ -844,7 +808,7 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> {
844
808
2. `Fence_scope` describes the scope of fence. "Workgroup" means that the scope would be
845
809
within each workgroup. "GPU" means the scope would be across workgroups within the GPU.
846
810
}];
847
- let arguments = (ins XeGPU_MemorySpaceAttr : $memory_kind,
811
+ let arguments = (ins XeGPU_MemoryScopeAttr : $memory_kind,
848
812
XeGPU_FenceScopeAttr: $fence_scope);
849
813
let assemblyFormat = [{`memory_kind` `=` `` $memory_kind `,` `fence_scope` `=` `` $fence_scope attr-dict}];
850
814
let extraClassDeclaration = extraBaseClassDeclaration;
0 commit comments