[mlir][XeGPU] Add MemoryEffectsOpInterface for XeGPU memory related ops. #125314
Conversation
@llvm/pr-subscribers-mlir @llvm/pr-subscribers-mlir-gpu

Author: Charitha Saumya (charithaintc)

Changes

Patch is 21.18 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/125314.diff

2 Files Affected:
diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
index c2335eecc3781d..d98aa9ffb26f1a 100644
--- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
+++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td
@@ -276,97 +276,103 @@ def XeGPU_PrefetchNdOp : XeGPU_Op<"prefetch_nd", []> {
}
-def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
- let summary = "loads a n-D block from memory (represented by TensorDesc)"
- "to registers (represented by vector)";
- let description = [{
- LoadNdOp essentially mimics the hardware block read instruction to read
- a block of data from memory to register. It takes a set of optional cache
- hints for each level of cache, L1, L2 and L3. If hardware does not have a
- correspoding cache, Corresponding cache hint attribute will be masked.
- VNNI transformation is an hardware feature for Intel GPU, which is used to
- do data packing during the load for B operand of matrix operation, if
- the bit width of the data type is less then 32 bits, e.g., fp16. And
- transpose is another Intel hardware feature, which will do transpose
- operation when loading the data if the bit width of the data type is
- fp32 or fp64. It implies that vnni and transpose cannot exit at the
- same time.
-
- Example:
- ```mlir
- xegpu.load_nd %1 {transpose = [1, 0],
- l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<streaming>}
- : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
- ```
-
-
- }];
-
- let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
- OptionalAttr<UnitAttr>: $packed,
- OptionalAttr<DenseI64ArrayAttr>: $transpose,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-
- let results = (outs XeGPU_ValueType: $value);
-
- let extraClassDeclaration = extraBaseClassDeclaration # [{
- VectorType getType() {
- return llvm::dyn_cast<VectorType>(getValue().getType());
- }
+def XeGPU_LoadNdOp : XeGPU_Op<"load_nd", [
+ AllElementTypesMatch<["value", "TensorDesc"]>,
+ DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ ]> {
+  let summary = "loads an n-D block from memory (represented by TensorDesc)"
+                " to registers (represented by vector)";
+  let description = [{
+    LoadNdOp essentially mimics the hardware block read instruction to read
+    a block of data from memory into registers. It takes a set of optional cache
+    hints for each level of cache, L1, L2 and L3. If the hardware does not have a
+    corresponding cache, the corresponding cache hint attribute will be masked.
+    The VNNI transformation is a hardware feature of Intel GPUs that packs data
+    during the load for the B operand of a matrix operation when the bit width
+    of the data type is less than 32 bits, e.g., fp16. Transpose is another
+    Intel hardware feature that transposes the data while loading it when the
+    data type is fp32 or fp64. This implies that vnni and transpose cannot be
+    used at the same time.
+
+ Example:
+ ```mlir
+ xegpu.load_nd %1 {transpose = [1, 0],
+ l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<streaming>}
+ : !xegpu.tensor_desc<8x16xf32> -> vector<16x8xf32>
+ ```
+
+
+ }];
+
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<UnitAttr>: $packed,
+ OptionalAttr<DenseI64ArrayAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let results = (outs XeGPU_ValueType: $value);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ VectorType getType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
+ }
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
- }];
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
- let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
- let hasVerifier = 1;
+ let assemblyFormat = "$TensorDesc prop-dict attr-dict `:` qualified(type($TensorDesc)) `->` type($value)";
+ let hasVerifier = 1;
}
-def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
- let summary = "stores a n-D block register region back to memory, currently only supports 2D";
-
- let description = [{
- StoreNdOp essentially mimics the hardware block write instruction io
- write a block of data from register into the memory region as described
- by the TensorDesc. It takes a set of optional cache hints for each level
- of cache, L1, L2 and L3. If hardware does not have a correspoding cache,
- Corresponding cache hint attribute will be masked.
-
- Example:
- ```mlir
- xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>}
- : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
- ```
-
-
- }];
-
- let arguments = (ins XeGPU_ValueType: $value,
- XeGPU_TensorDesc: $TensorDesc,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
-
- let extraClassDeclaration = extraBaseClassDeclaration # [{
- VectorType getValueType() {
- return llvm::dyn_cast<VectorType>(getValue().getType());
- }
+def XeGPU_StoreNdOp : XeGPU_Op<"store_nd", [
+ AllElementTypesMatch<["value", "TensorDesc"]>,
+ DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ ]> {
+  let summary = "stores an n-D block register region back to memory, currently only supports 2D";
+
+  let description = [{
+    StoreNdOp essentially mimics the hardware block write instruction to
+    write a block of data from registers into the memory region described
+    by the TensorDesc. It takes a set of optional cache hints for each level
+    of cache, L1, L2 and L3. If the hardware does not have a corresponding
+    cache, the corresponding cache hint attribute will be masked.
+
+ Example:
+ ```mlir
+ xegpu.store_nd %3, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16>
+ ```
+
+
+ }];
+
+ let arguments = (ins XeGPU_ValueType: $value,
+ XeGPU_TensorDesc: $TensorDesc,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ VectorType getValueType() {
+ return llvm::dyn_cast<VectorType>(getValue().getType());
+ }
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
- }];
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
+ }];
- let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
- `:` type($value) `,` qualified(type($TensorDesc))}];
- let hasVerifier = 1;
+ let assemblyFormat = [{$value `,` $TensorDesc prop-dict attr-dict
+ `:` type($value) `,` qualified(type($TensorDesc))}];
+ let hasVerifier = 1;
}
def XeGPU_UpdateNdOffsetOp : XeGPU_Op<"update_nd_offset",
@@ -548,131 +554,138 @@ def XeGPU_PrefetchOp : XeGPU_Op<"prefetch", []> {
let hasVerifier = 1;
}
-def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
- let summary = "load a set of scattered data points from memory.";
-
- let description = [{ It (aka. load) load data per each work-item. The output
- describes the data being loaded at the subgroup level, so its size is
- consistent with the number of work-items in a subgroup. When the chunk size
- is larger than 2, the output vector is a 2D vector, with dim-1 correspoding
- to work-items, and dim-0 corresponding to the chunk size loaded by each work-item.
- Specially, there is a transpose effect on the result (as compared to the TensorDesc)
- due to the hardware implementation. Therefore, a transpose attribute is introduced
- on purpose, making sure users are aware of this implicit transformation.
-
- The mask operand masks out memory access so that it is safe to pass out-of-boundary
- addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
-
- Example 1:
- ```mlir
- %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<uncached>}
- : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
- vector<16xi1> -> vector<16xf32>
- ```
+def XeGPU_LoadGatherOp : XeGPU_Op<"load", [
+ AllElementTypesMatch<["value", "TensorDesc"]>,
+ DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ ]> {
+ let summary = "load a set of scattered data points from memory.";
+
+  let description = [{ It (aka. load) loads data for each work-item. The output
+    describes the data being loaded at the subgroup level, so its size is
+    consistent with the number of work-items in a subgroup. When the chunk size
+    is larger than 2, the output vector is a 2D vector, with dim-1 corresponding
+    to work-items and dim-0 corresponding to the chunk size loaded by each work-item.
+    Notably, there is a transpose effect on the result (as compared to the TensorDesc)
+    due to the hardware implementation. Therefore, a transpose attribute is introduced
+    on purpose, making sure users are aware of this implicit transformation.
+
+    The mask operand masks out memory accesses so that it is safe to pass out-of-boundary
+    addresses/offsets as long as they are masked. It applies to slots of SIMD lanes.
+
+ Example 1:
+ ```mlir
+ %2 = xegpu.load %1, %0 {l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<uncached>}
+ : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<memory_space=global>>,
+ vector<16xi1> -> vector<16xf32>
+ ```
- Example 2:
- ```mlir
- %2 = xegpu.load %1, %0 {transpose,
- l1_hint = #xegpu.cache_hint<cached>,
- l2_hint = #xegpu.cache_hint<uncached>,
- l3_hint = #xegpu.cache_hint<uncached>}
- : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
- vector<16xi1> -> vector<8x16xf32>
- ```
+ Example 2:
+ ```mlir
+ %2 = xegpu.load %1, %0 {transpose,
+ l1_hint = #xegpu.cache_hint<cached>,
+ l2_hint = #xegpu.cache_hint<uncached>,
+ l3_hint = #xegpu.cache_hint<uncached>}
+ : !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr<memory_space=global, chunk_size=8>>,
+ vector<16xi1> -> vector<8x16xf32>
+ ```
- }];
+ }];
- let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
- XeGPU_MaskType: $mask,
- OptionalAttr<UnitAttr>: $transpose,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
- let results = (outs XeGPU_ValueType: $value);
+ let arguments = (ins XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_MaskType: $mask,
+ OptionalAttr<UnitAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ let results = (outs XeGPU_ValueType: $value);
- let extraClassDeclaration = extraBaseClassDeclaration # [{
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
- mlir::Type getElementType() {
- auto type = getValue().getType();
- return getElementTypeOrSelf(type);
- }
+ mlir::Type getElementType() {
+ auto type = getValue().getType();
+ return getElementTypeOrSelf(type);
+ }
- Type getValueType() {
- return getValue().getType();
- }
+ Type getValueType() {
+ return getValue().getType();
+ }
- Type getMaskType() {
- return getMask().getType();
- }
+ Type getMaskType() {
+ return getMask().getType();
+ }
- }];
+ }];
- let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict
- `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}];
+ let assemblyFormat = [{$TensorDesc `,` $mask prop-dict attr-dict
+ `:` qualified(type($TensorDesc)) `,` type($mask) `->` type($value)}];
- let hasVerifier = 1;
+ let hasVerifier = 1;
}
-def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementTypesMatch<["value", "TensorDesc"]>]> {
- let summary = "store data to scattered memory locations.";
- let description = [{ It (aka. store) stores data to scattered memory locations. The value is
- typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be
- a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes
- and the dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
- has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is
- introduced on purpose, making sure users are aware of this implicit transformation.
-
- Example 1:
- ```mlir
- %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>}
- : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
- ```
+def XeGPU_StoreScatterOp : XeGPU_Op<"store",
+ [
+ AllElementTypesMatch<["value", "TensorDesc"]>,
+ DeclareOpInterfaceMethods<MemoryEffectsOpInterface>
+ ]> {
+ let summary = "store data to scattered memory locations.";
+  let summary = "store data to scattered memory locations.";
+  let description = [{ It (aka. store) stores data to scattered memory locations. The value is
+    typically a 1D vector, but when the chunk size of the TensorDesc is larger than 1, it will be
+    a 2D vector instead. For the latter case, dim-1 of the value corresponds to the SIMD lanes
+    and dim-0 of the value corresponds to the chunk size stored per lane. So `store_scatter`
+    has a transpose effect, similar to `load_gather`. Therefore, a transpose attribute is
+    introduced on purpose, making sure users are aware of this implicit transformation.
+
+ Example 1:
+ ```mlir
+ %3 = xegpu.store %0, %1, %2 {l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<16xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scattered_tdesc_attr<>>, vector<16xi1>
+ ```
- Example 2:
- ```mlir
- %3 = xegpu.store %0, %1, %2 {transpose,
- l1_hint = #xegpu.cache_hint<uncached>,
- l2_hint = #xegpu.cache_hint<write_back>,
- l3_hint = #xegpu.cache_hint<write_through>}
- : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
- ```
+ Example 2:
+ ```mlir
+ %3 = xegpu.store %0, %1, %2 {transpose,
+ l1_hint = #xegpu.cache_hint<uncached>,
+ l2_hint = #xegpu.cache_hint<write_back>,
+ l3_hint = #xegpu.cache_hint<write_through>}
+ : vector<8x16xf32>, !xegpu.tensor_desc<16x8xf32, #xegpu.scattered_tdesc_attr<chunk_size=8>>, vector<16xi1>
+ ```
- }];
+ }];
- let arguments = (ins
- XeGPU_ValueType: $value,
- XeGPU_TensorDesc: $TensorDesc,
- XeGPU_MaskType: $mask,
- OptionalAttr<UnitAttr>: $transpose,
- OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
- OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
+ let arguments = (ins
+ XeGPU_ValueType: $value,
+ XeGPU_TensorDesc: $TensorDesc,
+ XeGPU_MaskType: $mask,
+ OptionalAttr<UnitAttr>: $transpose,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l1_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l2_hint,
+ OptionalAttr<XeGPU_CacheHintAttr>: $l3_hint);
- let extraClassDeclaration = extraBaseClassDeclaration # [{
- xegpu::TensorDescType getTensorDescType() {
- return getTensorDesc().getType();
- }
+ let extraClassDeclaration = extraBaseClassDeclaration # [{
+ xegpu::TensorDescType getTensorDescType() {
+ return getTensorDesc().getType();
+ }
- Type getValueType() {
- return getValue().getType();
- }
+ Type getValueType() {
+ return getValue().getType();
+ }
- Type getMaskType() {
- return getMask().getType();
- }
- }];
+ Type getMaskType() {
+ return getMask().getType();
+ }
+ }];
- let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict
- `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}];
+ let assemblyFormat = [{$value `,` $TensorDesc `,` $mask prop-dict attr-dict
+ `:` type($value) `,` qualified(type($TensorDesc)) `,` type($mask)}];
- let hasVerifier = 1;
+ let hasVerifier = 1;
}
def XeGPU_UpdateOffsetOp: XeGPU_Op<"update_offset",
diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
index cd883baa986b85..d015e5772a94f2 100644
--- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
+++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp
@@ -324,6 +324,12 @@ LogicalResult LoadNdOp::verify() {
return success();
}
+void LoadNdOp::getEffects(
+ SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+ &effects) {
+ effects.emplace_back(MemoryEffects::Read::get());
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_StoreNdOp
//===----------------------------------------------------------------------===//
@@ -361,6 +367,12 @@ LogicalResult StoreNdOp::verify() {
return success();
}
+void StoreNdOp::getEffects(
+ SmallVectorImpl<SideEffects::EffectInstance<MemoryEffects::Effect>>
+ &effects) {
+ effects.emplace_back(MemoryEffects::Write::get());
+}
+
//===----------------------------------------------------------------------===//
// XeGPU_UpdateNDOffsetOp
//===----------------------------------------------------------------------===//
@@ -494,7 +506,7 @@ LogicalResult PrefetchOp::verify() {
...
[truncated]
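As a rough sketch of what the new interface exposes, the snippet below queries the effects these ops now declare. It is illustrative only: it relies on the standard MLIR side-effect API (the C++ class behind the `MemoryEffectsOpInterface` TableGen trait is `MemoryEffectOpInterface`), and the helper name `mayReadMemory` is an assumption, not part of this patch.

```c++
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"

// Illustrative helper: returns true if `op` declares a Read effect, and
// conservatively true if it does not implement the interface at all.
static bool mayReadMemory(mlir::Operation *op) {
  auto iface = llvm::dyn_cast<mlir::MemoryEffectOpInterface>(op);
  if (!iface)
    return true; // Unknown effects: assume the worst.
  llvm::SmallVector<mlir::MemoryEffects::EffectInstance> effects;
  iface.getEffects(effects);
  return llvm::any_of(effects,
                      [](const mlir::MemoryEffects::EffectInstance &e) {
                        return llvm::isa<mlir::MemoryEffects::Read>(e.getEffect());
                      });
}
```

With this patch, `xegpu.load_nd` and `xegpu.load` would report a Read effect through such a query, while `xegpu.store_nd` and `xegpu.store` report a Write effect.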
@chencha3 @adam-smnk @Jianhui-Li Can you please take a look?
Force-pushed from 91fa249 to ffae029.
LGTM. Thanks for the updates
@adam-smnk Thanks for the merge!
[mlir][XeGPU] Add MemoryEffectsOpInterface for XeGPU memory related ops. (llvm#125314)

`MemoryEffectsOpInterface` is needed for cleaning up the remaining (dead) operations after XeGPU distribution into SIMT.
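A minimal sketch of the cleanup this enables, assuming the standard MLIR helpers `isOpTriviallyDead` and `Operation::erase`; the walk below is illustrative and not the actual SIMT-distribution pass. Once `xegpu.load_nd` declares only a Read effect, an unused load becomes trivially dead and can be erased, while `xegpu.store_nd` is preserved because it declares a Write effect.

```c++
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "llvm/ADT/SmallVector.h"

// Illustrative cleanup walk: erase ops that became dead after distribution,
// relying on the memory effects declared by this patch.
static void eraseTriviallyDeadOps(mlir::Operation *root) {
  llvm::SmallVector<mlir::Operation *> deadOps;
  root->walk([&](mlir::Operation *op) {
    // An unused xegpu.load_nd now reports only MemoryEffects::Read, so it is
    // trivially dead; xegpu.store_nd reports MemoryEffects::Write and is kept.
    if (op != root && mlir::isOpTriviallyDead(op))
      deadOps.push_back(op);
  });
  // The post-order walk collects nested ops before their parents, so erasing
  // in collection order is safe.
  for (mlir::Operation *op : deadOps)
    op->erase();
}
```

Without the interface, these ops would be treated as having unknown side effects and such dead-code cleanup would conservatively keep them.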