[MLIR][XeGPU] Update the type of offsets for CreateDescOp and UpdateOffsetOp #110741

Merged 3 commits on Oct 2, 2024.

1 change: 1 addition & 0 deletions mlir/include/mlir/Dialect/XeGPU/IR/XeGPU.h
@@ -11,6 +11,7 @@

#include "mlir/Bytecode/BytecodeOpInterface.h"
#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/Vector/IR/VectorOps.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Dialect.h"
#include "mlir/IR/TypeUtilities.h"
109 changes: 65 additions & 44 deletions mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -424,9 +424,9 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
It accepts the following parameters:

* source: a 1D memref or pointer (uint64_t) represents the flattened memory object.
-* offsets: a array containing offsets of each access point. Its size
+* offsets: a vector containing offsets of each access point. Its size
is fixed to the hardware supported subgroup size, e.g., 16 on PVC,
-implying each element in the array corresponds to a work-item (SIMT lane)
+implying each element in the vector corresponds to a work-item (SIMT lane)
in the subgroup.

The first dimension of the result TensorDesc corresponds to work-items, so it should
@@ -436,56 +436,59 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> {
Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64]
```mlir
%a = memref.alloc() : memref<1024xf32>
-%1 = xegpu.create_tdesc %a[0, 16, 32, 64]: memref<1024xf32> -> TensorDesc<4xf32>
+%0 = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
+%1 = xegpu.create_tdesc %a, %0: memref<1024xf32>, vector<4xindex> -> TensorDesc<4xf32>
```

Example 2. It assumes subgroup size is 4, and each work-item accesses 8 elements.
It accesses 32 data elements in total: a[0:7], a[16:23], a[32:39], a[64:71]
```mlir
%0 = memref.alloc() : memref<1024xf32>
-%1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>
+%off = arith.constant dense<[0, 16, 32, 64]> : vector<4xindex>
+%1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
+-> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>>
```

Example 3. It is similar to Example 2, but there is some overlap among work-items.
It accesses: a[0:7], a[4:11], a[8:15], a[12:19]
```mlir
%0 = memref.alloc() : memref<1024xf32>
-%1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>>
+%off = arith.constant dense<[0, 4, 8, 12]> : vector<4xindex>
+%1 = xegpu.create_tdesc %0, %off : memref<1024xf32>, vector<4xindex>
+-> TensorDesc<4x8xf32, #xegpu.scattered_tdesc_attr<chunk_size = 8>>
```
}];

let arguments = (ins XeGPU_BaseAddrType: $source,
-Variadic<Index>: $offsets,
-DenseI64ArrayAttr: $const_offsets);
+XeGPU_OffsetType: $offsets);
let results = (outs XeGPU_TensorDesc:$TensorDesc);

+let builders = [
+OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source,
+"llvm::ArrayRef<OpFoldResult>": $offsets)>,
+OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "mlir::Value": $source,
+"llvm::ArrayRef<int64_t>": $offsets)>,
+];

let assemblyFormat = [{
-$source
-custom<DynamicIndexList>($offsets, $const_offsets)
-attr-dict `:` type($source) `->` qualified(type($TensorDesc))
+$source `,` $offsets attr-dict `:` type($source) `,` type($offsets) `->` qualified(type($TensorDesc))
}];

-let extraClassDeclaration = extraBaseClassDeclaration # [{
+let extraClassDeclaration = [{
xegpu::TensorDescType getTensorDescType() {
return getTensorDesc().getType();
}

-SmallVector<OpFoldResult> getMixedOffsets() {
-Builder b(getContext());
-return getMixedValues(getConstOffsets(), getOffsets(), b);
+mlir::VectorType getOffsetsType() {
+return getOffsets().getType();
}

size_t getNumOffsets() {
-return getMixedOffsets().size();
+return getOffsetsType().getNumElements();
}

mlir::Value getViewSource() { return getSource(); }

-OpFoldResult getOffset(unsigned idx) {
-assert(idx < getNumOffsets() && "Invalid out of bound access.");
-return getMixedOffsets()[idx];
-}

unsigned getSourceMemorySpace() {
auto srcTy = getSource().getType();
if (auto memrefTy = llvm::dyn_cast<mlir::MemRefType>(srcTy)) {
@@ -550,24 +553,33 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"]
describes the data being loaded at the subgroup level, so its size is
consistent with the number of work-items in a subgroup. When the chunk size
is larger than 2, the output vector is a 2D vector, with dim-1 corresponding
-to work-items, and dim-0 corresponding to the chunk_size loaded by each work-item.
+to work-items, and dim-0 corresponding to the chunk size loaded by each work-item.
Specifically, there is a transpose effect on the result (as compared to the TensorDesc)
due to the hardware implementation. Therefore, a transpose attribute is introduced
on purpose, making sure users are aware of this implicit transformation.

The mask operand masks out memory access so that it is safe to pass out-of-boundary
addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.

-Example:
+Example 1:
```mlir
-%2 = xegpu.load %1, %0 {transpose,
-l1_hint = #xegpu.cache_hint<cached>,
+%2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
l2_hint = #xegpu.cache_hint<uncached>,
l3_hint = #xegpu.cache_hint<uncached>}
: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
vector<16xi1> -> vector<16xf32>
```

+Example 2:
+```mlir
+%2 = xegpu.load %1, %0 {transpose,
+l1_hint = #xegpu.cache_hint<cached>,
+l2_hint = #xegpu.cache_hint<uncached>,
+l3_hint = #xegpu.cache_hint<uncached>}
+: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
+vector<16xi1> -> vector<8x16xf32>
+```

}];

let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
@@ -610,17 +622,27 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "T
let description = [{ It (aka. store) stores data to scattered memory locations. The value is
typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
a 2D vector instead. For the latter case, dim-1 of the value corresponds to the simd lanes
-and the dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter`
+and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
has a transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
introduced on purpose, making sure users are aware of this implicit transformation.

-Example:
+Example 1:
```mlir
%3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
l2_hint = #xegpu.cache_hint<write_back>,
l3_hint = #xegpu.cache_hint<write_through>}
-: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr<scattered=true>>, vector<16xi1>
+: vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
```

+Example 2:
+```mlir
+%3 = xegpu.store %0, %1, %2 {transpose,
+l1_hint = #xegpu.cache_hint<uncached>,
+l2_hint = #xegpu.cache_hint<write_back>,
+l3_hint = #xegpu.cache_hint<write_through>}
+: vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
+```

}];

let arguments = (ins
@@ -666,40 +688,39 @@ def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",

Example:
```mlir
-%2 = xegpu.update_offset %1, [32, 32, 32, 32]
-: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr<scattered = true>>
+%off = arith.constant dense<[32, 32, 32, 32]> : vector<4xindex>
+%2 = xegpu.update_offset %1, %off :
+!xegpu.tensor_desc<4x2xf32, #xegpu.scattered_tdesc_attr<>>, vector<4xindex>
```
}];

let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
-Variadic<Index>: $offsets,
-DenseI64ArrayAttr: $const_offsets);
+XeGPU_OffsetType: $offsets);
let results = (outs XeGPU_TensorDesc: $result);

-let extraClassDeclaration = extraBaseClassDeclaration # [{
+let builders = [
+OpBuilder<(ins "mlir::Value": $TensorDesc,
+"llvm::ArrayRef<OpFoldResult>": $offsets)>,
+OpBuilder<(ins "mlir::Value": $TensorDesc,
+"llvm::ArrayRef<int64_t>": $offsets)>
+];
+
+let extraClassDeclaration = [{
xegpu::TensorDescType getTensorDescType() {
return getTensorDesc().getType();
}

-SmallVector<OpFoldResult> getMixedOffsets() {
-Builder b(getContext());
-return getMixedValues(getConstOffsets(), getOffsets(), b);
+mlir::VectorType getOffsetsType() {
+return getOffsets().getType();
}

size_t getNumOffsets() {
-return getMixedOffsets().size();
-}
-
-OpFoldResult getOffset(unsigned idx) {
-assert(idx < getNumOffsets() && "Invalid out of bound access.");
-return getMixedOffsets()[idx];
+return getOffsetsType().getNumElements();
}
}];

let assemblyFormat = [{
-$TensorDesc `,`
-custom<DynamicIndexList>($offsets, $const_offsets)
-attr-dict `:` qualified(type($TensorDesc))
+$TensorDesc `,` $offsets attr-dict `:` qualified(type($TensorDesc)) `,` type($offsets)
}];
}

42 changes: 42 additions & 0 deletions mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//

#include "mlir/Dialect/Arith/Utils/Utils.h"
#include "mlir/Dialect/Utils/StaticValueUtils.h"
#include "mlir/Dialect/XeGPU/IR/XeGPU.h"
#include "mlir/IR/Builders.h"
@@ -308,6 +309,24 @@ LogicalResult UpdateNdOffsetOp::verify() {
// XeGPU_CreateDescOp
//===----------------------------------------------------------------------===//

+void CreateDescOp::build(OpBuilder &builder, OperationState &state,
+TensorDescType TensorDesc, Value source,
+llvm::ArrayRef<OpFoldResult> offsets) {
+auto loc = source.getLoc();
+int64_t size = static_cast<int64_t>(offsets.size());
+auto type = VectorType::get(size, builder.getIndexType());
+auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets);
+auto offset = builder.create<vector::FromElementsOp>(loc, type, values);
+build(builder, state, TensorDesc, source, offset);
+}
+
+void CreateDescOp::build(OpBuilder &builder, OperationState &state,
+TensorDescType TensorDesc, Value source,
+llvm::ArrayRef<int64_t> offsets) {
+auto ofrs = getAsIndexOpFoldResult(builder.getContext(), offsets);
+build(builder, state, TensorDesc, source, ofrs);
+}
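
The two builders above let C++ clients keep passing offsets as `OpFoldResult`s or plain integers: each offset is materialized as an index constant and the constants are packed into a `vector<Nxindex>` value via `vector.from_elements`. A minimal caller sketch (not part of this diff; `builder`, `loc`, `tdescTy`, and `src` are assumed to exist in the surrounding pass):

```c++
// Hypothetical usage of the new CreateDescOp builder. Assumed context:
//   OpBuilder builder;             // positioned at the insertion point
//   Location loc;                  // location for the new ops
//   xegpu::TensorDescType tdescTy; // e.g. a scattered TensorDesc<4xf32>
//   Value src;                     // e.g. a memref<1024xf32> value
SmallVector<int64_t> offsets = {0, 16, 32, 64};
// The ArrayRef<int64_t> overload converts the integers to OpFoldResults,
// creates index constants, and packs them into a vector<4xindex> operand.
auto desc = builder.create<xegpu::CreateDescOp>(loc, tdescTy, src, offsets);
```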

LogicalResult CreateDescOp::verify() {
auto tdescTy = getTensorDescType();

@@ -473,6 +492,29 @@ LogicalResult StoreScatterOp::verify() {

return success();
}

+//===----------------------------------------------------------------------===//
+// XeGPU_UpdateOffsetOp
+//===----------------------------------------------------------------------===//
+void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
+mlir::Value tensorDesc,
+llvm::ArrayRef<OpFoldResult> offsets) {
+auto tdescTy = mlir::dyn_cast<TensorDescType>(tensorDesc.getType());
+assert(tdescTy && "Expecting the source is a TensorDescType value.");
+auto loc = tensorDesc.getLoc();
+int64_t size = static_cast<int64_t>(offsets.size());
+auto type = VectorType::get({size}, builder.getIndexType());
+auto values = getValueOrCreateConstantIndexOp(builder, loc, offsets);
+auto offset = builder.create<vector::FromElementsOp>(loc, type, values);
+build(builder, state, tdescTy, tensorDesc, offset);
+}
+
+void UpdateOffsetOp::build(OpBuilder &builder, OperationState &state,
+Value tensorDesc, llvm::ArrayRef<int64_t> offsets) {
+auto ofrs = getAsIndexOpFoldResult(builder.getContext(), offsets);
+build(builder, state, tensorDesc, ofrs);
+}
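
Likewise for UpdateOffsetOp, a hypothetical caller sketch (again assuming `builder`, `loc`, and an existing scattered TensorDesc value `desc`):

```c++
// Advance every lane's offset by 32 elements. The builder takes the result
// type from `desc` and packs the steps into a vector<4xindex> constant.
SmallVector<int64_t> steps = {32, 32, 32, 32};
auto updated = builder.create<xegpu::UpdateOffsetOp>(loc, desc, steps);
```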

//===----------------------------------------------------------------------===//
// XeGPU_DpasOp
//===----------------------------------------------------------------------===//