[mlir][vector]add extractInsertFoldConstantOp fold function and apply it to extractOp and insertOp. #124399

Merged

Conversation

linuxlonelyeagle
Member

@linuxlonelyeagle linuxlonelyeagle commented Jan 25, 2025

see #121631

  • What does this PR achieve?
    It implements constant folding of the dynamic index operands of insertOp and extractOp.
// for example
%0 = arith.constant 0 : index
%1 = vector.extract %v[%0,%0] : i32 from vector<32x1xi32>

// after fold
%1 = vector.extract %v[0,0] : i32 from vector<32x1xi32>

Why implement this feature?
I am implementing an efficient GEMM on GPU (currently using CUDA cores), and when working with registers, the vector abstraction level is the right fit.
Consider the following example. It cannot be lowered to LLVM IR. The fundamental reason is that insertOp and extractOp end up in the `vector.extract %v[%0,%0] : i32 from vector<32x1xi32>` form shown above. The fundamental purpose of this PR is to solve this problem.
mlir-opt vector-extract-insert.mlir -affine-loop-unroll="unroll-full=true unroll-num-reps=10" -convert-arith-to-llvm -convert-vector-to-llvm

func.func @main() -> i32 {
  %c = arith.constant dense<0> : vector<32x1xi32>
  %a = arith.constant dense<1> : vector<32x1xi32>
  %b = arith.constant dense<1> : vector<1x1xi32>
  %c_0 = arith.constant 0 : index
  %y = affine.for %arg_0 = 0 to 32 iter_args(%it_0 = %c) -> vector<32x1xi32> {
    %y_0 = affine.for %arg_1 = 0 to 1 iter_args(%it_1 = %it_0) -> vector<32x1xi32> {
      %c_elem = vector.extract %it_1[%arg_0, %arg_1] : i32 from vector<32x1xi32>
      %a_elem = vector.extract %a[%arg_0, %c_0] : i32 from vector<32x1xi32>
      %b_elem = vector.extract %b[%arg_1, %c_0] : i32 from vector<1x1xi32>
      %mul = arith.muli %a_elem, %b_elem : i32
      %ret = arith.addi %mul, %c_elem : i32 
      %y_1 = vector.insert %ret, %it_1[%arg_0, %arg_1] : i32 into vector<32x1xi32>
      affine.yield %y_1 : vector<32x1xi32>
    }
    affine.yield %y_0 : vector<32x1xi32>
  } 
  vector.print %c : vector<32x1xi32>
  vector.print %y : vector<32x1xi32>
  %ret = arith.constant 0 : i32
  return %ret : i32
}

@llvmbot
Member

llvmbot commented Jan 25, 2025

@llvm/pr-subscribers-mlir-vector

@llvm/pr-subscribers-mlir

Author: lonely eagle (linuxlonelyeagle)

Changes

see #121631
Ping: @Groverkss @ftynse I took a more clever approach to implement it, which I think should be good.


Full diff: https://github.com/llvm/llvm-project/pull/124399.diff

3 Files Affected:

  • (modified) mlir/lib/Dialect/Vector/IR/VectorOps.cpp (+45)
  • (modified) mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir (+36)
  • (modified) mlir/test/Dialect/Vector/vector-warp-distribute.mlir (+1-2)
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 3fbfcb4979b495..5021b097fc5ef6 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -17,6 +17,7 @@
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
@@ -1977,6 +1978,46 @@ static Value foldScalarExtractFromFromElements(ExtractOp extractOp) {
   return fromElementsOp.getElements()[flatIndex];
 }
 
+// If a dynamic position operand of `extractOp` or `insertOp` is defined by a
+// `constantOp`, fold it into the static position.
+template <typename T>
+static void foldConstantOp(T op, SmallVectorImpl<Value> &operands) {
+  auto staticPosition = op.getStaticPosition().vec();
+  OperandRange dynamicPosition = op.getDynamicPosition();
+
+  // If there are no dynamic position operands, there is nothing to fold.
+  if (!dynamicPosition.size())
+    return;
+  unsigned index = 0;
+
+  // `opChange` is a flag; if true, `op` is updated in place.
+  bool opChange = false;
+  for (unsigned i = 0, e = staticPosition.size(); i < e; ++i) {
+    if (!ShapedType::isDynamic(staticPosition[i]))
+      continue;
+    Value position = dynamicPosition[index++];
+
+    // If it is a block parameter, proceed to the next iteration.
+    if (!position.getDefiningOp()) {
+      operands.push_back(position);
+      continue;
+    }
+
+    if (auto constantOp =
+            mlir::dyn_cast<arith::ConstantIndexOp>(position.getDefiningOp())) {
+      opChange = true;
+      staticPosition[i] = constantOp.value();
+      continue;
+    }
+    operands.push_back(position);
+  }
+
+  if (opChange) {
+    op.setStaticPosition(staticPosition);
+    op.getOperation()->setOperands(operands);
+  }
+}
+
 OpFoldResult ExtractOp::fold(FoldAdaptor) {
   // Fold "vector.extract %v[] : vector<2x2xf32> from vector<2x2xf32>" to %v.
   // Note: Do not fold "vector.extract %v[] : f32 from vector<f32>" (type
@@ -1999,6 +2040,8 @@ OpFoldResult ExtractOp::fold(FoldAdaptor) {
     return val;
   if (auto val = foldScalarExtractFromFromElements(*this))
     return val;
+  SmallVector<Value> operands = {getVector()};
+  foldConstantOp(*this, operands);
   return OpFoldResult();
 }
 
@@ -3028,6 +3071,8 @@ OpFoldResult vector::InsertOp::fold(FoldAdaptor adaptor) {
   // (type mismatch).
   if (getNumIndices() == 0 && getSourceType() == getType())
     return getSource();
+  SmallVector<Value> operands = {getSource(), getDest()};
+  foldConstantOp(*this, operands);
   return {};
 }
 
diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
index 29bed9aae56827..f8f5f9039bb146 100644
--- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
+++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir
@@ -4115,3 +4115,39 @@ func.func @step_scalable() -> vector<[4]xindex> {
   %0 = vector.step : vector<[4]xindex>
   return %0 : vector<[4]xindex>
 }
+
+// -----
+
+// CHECK-LABEL: @extract_arith_constnt
+func.func @extract_arith_constnt() -> i32 {
+  %v = arith.constant dense<0> : vector<32x1xi32>
+  %c_0 = arith.constant 0 : index
+  %elem = vector.extract %v[%c_0, %c_0] : i32 from vector<32x1xi32>
+  return %elem : i32
+}
+
+// CHECK: %[[VAL_0:.*]] = arith.constant dense<0> : vector<32x1xi32>
+// CHECK: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[VAL_0]] : vector<32x1xi32> to !llvm.array<32 x vector<1xi32>>
+// CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_1]][0] : !llvm.array<32 x vector<1xi32>>
+// CHECK: %[[VAL_3:.*]] = llvm.mlir.constant(0 : i64) : i64
+// CHECK: %{{.*}} = llvm.extractelement %[[VAL_2]]{{\[}}%[[VAL_3]] : i64] : vector<1xi32>
+
+// -----
+
+// CHECK-LABEL: @insert_arith_constnt()
+
+func.func @insert_arith_constnt() -> vector<32x1xi32> {
+  %v = arith.constant dense<0> : vector<32x1xi32>
+  %c_0 = arith.constant 0 : index
+  %c_1 = arith.constant 1 : i32
+  %v_1 = vector.insert %c_1, %v[%c_0, %c_0] : i32 into vector<32x1xi32>
+  return %v_1 : vector<32x1xi32>
+}
+    
+// CHECK: %[[VAL_0:.*]] = arith.constant dense<0> : vector<32x1xi32>
+// CHECK: %[[VAL_1:.*]] = builtin.unrealized_conversion_cast %[[VAL_0]] : vector<32x1xi32> to !llvm.array<32 x vector<1xi32>>
+// CHECK: %[[VAL_2:.*]] = arith.constant 1 : i32
+// CHECK: %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_1]][0] : !llvm.array<32 x vector<1xi32>>
+// CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i64) : i64
+// CHECK: %[[VAL_5:.*]] = llvm.insertelement %[[VAL_2]], %[[VAL_3]]{{\[}}%[[VAL_4]] : i64] : vector<1xi32>
+// CHECK: %{{.*}} = llvm.insertvalue %[[VAL_5]], %[[VAL_1]][0] : !llvm.array<32 x vector<1xi32>>
diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
index dbe0b39422369c..38771f25934495 100644
--- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -778,12 +778,11 @@ func.func @warp_constant(%laneid: index) -> (vector<1xf32>) {
 
 // CHECK-PROP-LABEL: func.func @vector_extract_1d(
 //   CHECK-PROP-DAG:   %[[C5_I32:.*]] = arith.constant 5 : i32
-//   CHECK-PROP-DAG:   %[[C1:.*]] = arith.constant 1 : index
 //       CHECK-PROP:   %[[R:.*]] = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<2xf32>) {
 //       CHECK-PROP:     %[[V:.*]] = "some_def"() : () -> vector<64xf32>
 //       CHECK-PROP:     gpu.yield %[[V]] : vector<64xf32>
 //       CHECK-PROP:   }
-//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[R]][%[[C1]]] : f32 from vector<2xf32>
+//       CHECK-PROP:   %[[E:.*]] = vector.extract %[[R]][1] : f32 from vector<2xf32>
 //       CHECK-PROP:   %[[SHUFFLED:.*]], %{{.*}} = gpu.shuffle  idx %[[E]], %[[C5_I32]]
 //       CHECK-PROP:   return %[[SHUFFLED]] : f32
 func.func @vector_extract_1d(%laneid: index) -> (f32) {

@linuxlonelyeagle linuxlonelyeagle force-pushed the advance-support-extract-insert branch from 3ff7e59 to b6b4362 Compare January 25, 2025 09:05
Member

@Groverkss Groverkss left a comment


Thank you! This is good, just some test change comments.

Member

@ftynse ftynse left a comment


Thank you! The overall approach looks reasonable, but the implementation can be simplified.

@linuxlonelyeagle
Member Author

linuxlonelyeagle commented Jan 26, 2025

I seem to be having a problem: I can't seem to enable the integration tests.

cmake -G Ninja ../llvm \                     
    -DLLVM_ENABLE_PROJECTS="mlir;" \
    -DLLVM_TARGETS_TO_BUILD="host;NVPTX" \
    -DLLVM_ENABLE_ASSERTIONS=ON \
    -DCMAKE_BUILD_TYPE=RELEASE \
    -DMLIR_ENABLE_CUDA_RUNNER=ON DMLIR_INCLUDE_INTEGRATION_TESTS=ON

 cmake --build . --target check-mlir-integration

[0/1] Running lit suite /root/llvm/llvm-project/mlir/test/Integration

Testing Time: 0.24s

Total Discovered Tests: 340
  Unsupported: 340 (100.00%)
./llvm-project/build/bin/llvm-lit -v llvm-project/mlir/test/Integration/Dialect/Vector/CPU/ 

xxxx
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/ArmSME/test-setArmSVLBits.mlir (68 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/outerproduct-i64.mlir (69 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/AMX/tilezero.mlir (70 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/ArmSME/transfer-write-2d.mlir (71 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/ArmSVE/arrays-of-scalable-vectors.mlir (72 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/ArmSME/load-store-128-bit-tile.mlir (73 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/transfer-write.mlir (74 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/fma.mlir (75 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/ArmSVE/test-setArmVLBits.mlir (76 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/ArmSVE/scalable-interleave.mlir (77 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/compress.mlir (78 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/broadcast.mlir (79 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/X86Vector/dot.mlir (80 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/print-int.mlir (81 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/sparse-dot-matvec.mlir (82 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/shuffle.mlir (83 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/contraction.mlir (84 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/transpose.mlir (85 of 86)
UNSUPPORTED: MLIR :: Integration/Dialect/Vector/CPU/AMX/muli-full.mlir (86 of 86)

Testing Time: 0.14s

Total Discovered Tests: 86
  Unsupported: 86 (100.00%)

@ftynse
Member

ftynse commented Jan 27, 2025

-DMLIR_ENABLE_CUDA_RUNNER=ON DMLIR_INCLUDE_INTEGRATION_TESTS=ON

You are missing a dash before DMLIR_INCLUDE_INTEGRATION_TESTS; it should be -DMLIR_INCLUDE_INTEGRATION_TESTS.
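For reference, a corrected configure invocation (the same options as the reporter's command above, with the missing dash added and a line continuation before the last option; the project list and targets are the reporter's, not verified here):

```shell
cmake -G Ninja ../llvm \
    -DLLVM_ENABLE_PROJECTS="mlir" \
    -DLLVM_TARGETS_TO_BUILD="host;NVPTX" \
    -DLLVM_ENABLE_ASSERTIONS=ON \
    -DCMAKE_BUILD_TYPE=Release \
    -DMLIR_ENABLE_CUDA_RUNNER=ON \
    -DMLIR_INCLUDE_INTEGRATION_TESTS=ON

cmake --build . --target check-mlir-integration
```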

Contributor

@dcaballe dcaballe left a comment


LG. Would you mind adding support for poison indices? They are about to be introduced by #123488 and they would also fall into the "constant index" category.

Contributor

@banach-space banach-space left a comment


Thanks for the updates so far 🙏🏻 The folder itself looks good, but I am leaving some suggestions re testing.

@linuxlonelyeagle linuxlonelyeagle changed the title [mlir][vector]add foldConstantOp fold function and apply it to extractOp and insertOp. [mlir][vector]add extractInsertFoldConstantOp fold function and apply it to extractOp and insertOp. Jan 30, 2025
@linuxlonelyeagle
Member Author

Ping: @banach-space Maybe you forgot about this PR, can you speed up this PR a bit? thank you.

@banach-space
Contributor

Ping: @banach-space Maybe you forgot about this PR, can you speed up this PR a bit? thank you.

Sorry about the delay, I have been travelling. Posting new round of reviews shortly.

@linuxlonelyeagle linuxlonelyeagle force-pushed the advance-support-extract-insert branch from 47ff7c6 to a2a5c93 Compare February 4, 2025 06:25
Contributor

@banach-space banach-space left a comment


This makes sense to me, thanks so much for seeing this through!

Please wait for @ftynse to unblock this and for @dcaballe and @Groverkss to take another look. Thank you 🙏🏻

@linuxlonelyeagle
Member Author

Ping @ftynse @Groverkss @dcaballe This PR still needs a final review from you; I have resolved the conflicts again. Thank you.

@linuxlonelyeagle linuxlonelyeagle merged commit 7ae78a6 into llvm:main Feb 10, 2025
8 checks passed
Icohedron pushed a commit to Icohedron/llvm-project that referenced this pull request Feb 11, 2025
… it to extractOp and insertOp. (llvm#124399)

add extractInsertFoldConstantOp fold function and apply it to extractOp and insertOp.
sivan-shani pushed a commit to sivan-shani/llvm-project that referenced this pull request Feb 24, 2025
… it to extractOp and insertOp. (llvm#124399)

add extractInsertFoldConstantOp fold function and apply it to extractOp and insertOp.