
[InstCombine] Implement vp.reverse reordering/elimination through binop/unop #143963


Merged: preames merged 6 commits into llvm:main from pr-instcombine-vp-reduce-elimination on Jun 18, 2025

Conversation

@preames (Collaborator) commented Jun 12, 2025

This simply copies the structure of the vector.reverse patterns from just above, and reimplements them for the vp.reverse intrinsics when the mask is all ones and the EVLs exactly match.

It's unfortunate that we have three different ways to represent a reverse (shuffle, vector.reverse, and vp.reverse), but I don't see an obvious way to remove any of them because the semantics are slightly different.

This significantly improves vectorization in TSVC_2's s112 and s1112 loops when using EVL tail folding.
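
For concreteness, here is a minimal before/after sketch of the fold, mirroring the binop_reverse_elim test in the diff below (all-ones mask, single shared EVL):

  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %add = add nsw <vscale x 4 x i32> %a.rev, %b.rev
  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  ret <vscale x 4 x i32> %add.rev

becomes

  %add = add nsw <vscale x 4 x i32> %a, %b
  ret <vscale x 4 x i32> %add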

@preames preames requested a review from dtcxzyw June 12, 2025 20:22
@preames preames requested a review from nikic as a code owner June 12, 2025 20:22
@llvmbot llvmbot added the llvm:instcombine (covers the InstCombine, InstSimplify and AggressiveInstCombine passes) and llvm:transforms labels Jun 12, 2025
@llvmbot (Member) commented Jun 12, 2025

@llvm/pr-subscribers-llvm-transforms

Author: Philip Reames (preames)

Changes

This simply copies the structure of the vector.reverse patterns from just above, and reimplements them for the vp.reverse intrinsics when the masks and EVLs exactly match.

It's unfortunate that we have three different ways to represent a reverse (shuffle, vector.reverse, and vp.reverse), but I don't see an obvious way to remove any of them because the semantics are slightly different.

This significantly improves vectorization in TSVC_2's s112 and s1112 loops when using EVL tail folding.


Full diff: https://github.com/llvm/llvm-project/pull/143963.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp (+36)
  • (modified) llvm/test/Transforms/InstCombine/vp-reverse.ll (+10-22)
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index c169ab25b2106..ee7c9dbac2fa3 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3582,6 +3582,42 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
+  case Intrinsic::experimental_vp_reverse: {
+    Value *BO0, *BO1, *X, *Y;
+    Value *Vec = II->getArgOperand(0);
+    Value *Mask = II->getArgOperand(1);
+    Value *EVL = II->getArgOperand(2);
+    auto m_VPReverse = [&](Value *&Vec) {
+      return m_Intrinsic<Intrinsic::experimental_vp_reverse>(
+          m_Value(Vec), m_Specific(Mask), m_Specific(EVL));
+    };
+    if (match(Vec, m_OneUse(m_BinOp(m_Value(BO0), m_Value(BO1))))) {
+      auto *OldBinOp = cast<BinaryOperator>(Vec);
+      if (match(BO0, m_VPReverse(X))) {
+        // rev(binop rev(X), rev(Y)) --> binop X, Y
+        if (match(BO1, m_VPReverse(Y)))
+          return replaceInstUsesWith(CI, BinaryOperator::CreateWithCopiedFlags(
+                                             OldBinOp->getOpcode(), X, Y,
+                                             OldBinOp, OldBinOp->getName(),
+                                             II->getIterator()));
+        // rev(binop rev(X), BO1Splat) --> binop X, BO1Splat
+        if (isSplatValue(BO1))
+          return replaceInstUsesWith(CI, BinaryOperator::CreateWithCopiedFlags(
+                                             OldBinOp->getOpcode(), X, BO1,
+                                             OldBinOp, OldBinOp->getName(),
+                                             II->getIterator()));
+      }
+    }
+    // rev(unop rev(X)) --> unop X
+    if (match(Vec, m_OneUse(m_UnOp(m_VPReverse(X))))) {
+      auto *OldUnOp = cast<UnaryOperator>(Vec);
+      auto *NewUnOp = UnaryOperator::CreateWithCopiedFlags(
+          OldUnOp->getOpcode(), X, OldUnOp, OldUnOp->getName(),
+          II->getIterator());
+      return replaceInstUsesWith(CI, NewUnOp);
+    }
+    break;
+  }
   case Intrinsic::vector_reduce_or:
   case Intrinsic::vector_reduce_and: {
     // Canonicalize logical or/and reductions:
diff --git a/llvm/test/Transforms/InstCombine/vp-reverse.ll b/llvm/test/Transforms/InstCombine/vp-reverse.ll
index 79e6c47bdf1b2..41b27be5f5248 100644
--- a/llvm/test/Transforms/InstCombine/vp-reverse.ll
+++ b/llvm/test/Transforms/InstCombine/vp-reverse.ll
@@ -3,11 +3,8 @@
 
 define <vscale x 4 x i32> @binop_reverse_elim(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_elim(
-; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[B:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], [[B]]
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
@@ -18,11 +15,8 @@ define <vscale x 4 x i32> @binop_reverse_elim(<vscale x 4 x i32> %a, <vscale x 4
 
 define <vscale x 4 x i32> @binop_reverse_elim2(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_elim2(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i1> [[M:%.*]], i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i1> [[M]], i32 [[EVL]])
-; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV]], [[B_REV]]
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> [[M]], i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV:%.*]], [[B_REV:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> %m, i32 %evl)
   %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> %m, i32 %evl)
@@ -63,10 +57,8 @@ define <vscale x 4 x i32> @binop_reverse_elim_diffevl(<vscale x 4 x i32> %a, <vs
 
 define <vscale x 4 x i32> @binop_reverse_splat_elim(<vscale x 4 x i32> %a, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_splat_elim(
-; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], splat (i32 22)
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], splat (i32 22)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %add = add nsw <vscale x 4 x i32> %a.rev, splat (i32 22)
@@ -76,10 +68,8 @@ define <vscale x 4 x i32> @binop_reverse_splat_elim(<vscale x 4 x i32> %a, i32 %
 
 define <vscale x 4 x i32> @binop_reverse_splat_elim2(<vscale x 4 x i32> %a, i32 %evl) {
 ; CHECK-LABEL: @binop_reverse_splat_elim2(
-; CHECK-NEXT:    [[A:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[A1:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A]], splat (i32 22)
-; CHECK-NEXT:    [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD_REV]]
+; CHECK-NEXT:    [[ADD1:%.*]] = add nsw <vscale x 4 x i32> [[A:%.*]], splat (i32 22)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[ADD1]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %add = add nsw <vscale x 4 x i32> splat (i32 22), %a.rev
@@ -89,10 +79,8 @@ define <vscale x 4 x i32> @binop_reverse_splat_elim2(<vscale x 4 x i32> %a, i32
 
 define <vscale x 4 x float> @unop_reverse_splat_elim(<vscale x 4 x float> %a, <vscale x 4 x float> %b, i32 %evl) {
 ; CHECK-LABEL: @unop_reverse_splat_elim(
-; CHECK-NEXT:    [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
-; CHECK-NEXT:    [[OP:%.*]] = fneg <vscale x 4 x float> [[A_REV]]
-; CHECK-NEXT:    [[OP_REV:%.*]] = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> [[OP]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL]])
-; CHECK-NEXT:    ret <vscale x 4 x float> [[OP_REV]]
+; CHECK-NEXT:    [[OP:%.*]] = fneg <vscale x 4 x float> [[A_REV:%.*]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[OP]]
 ;
   %a.rev = tail call <vscale x 4 x float> @llvm.experimental.vp.reverse.nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
   %op = fneg <vscale x 4 x float> %a.rev

Value *EVL = II->getArgOperand(2);
auto m_VPReverse = [&](Value *&Vec) {
return m_Intrinsic<Intrinsic::experimental_vp_reverse>(
m_Value(Vec), m_Specific(Mask), m_Specific(EVL));
Collaborator (inline comment on the snippet above):
The mask is kind of weird. The inner vp.reverse would poison any elements where the mask is 0. The mask of the outer vp.reverse would need to be the reverse of the inner mask to not consume the poison elements.
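
To make the hazard concrete, a worked 4-lane example (assuming masked-off vp.reverse lanes are poison, as described above): with x = <a, b, c, d>, m = <1, 1, 1, 0>, and evl = 4,

  rev(x, m, 4)            = <d, c, b, poison>        ; lane 3 masked off
  rev(rev(x, m, 4), m, 4) = <poison, b, c, poison>   ; lane 0 reads the inner poison; lane 3 is masked off again

The poison lanes of the result end up mirrored rather than aligned with m, which is why the outer mask would need to be the reverse of the inner one.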

preames (Collaborator, Author) replied:
You're 100% right. The case I actually care about is a true mask, so I'm going to restrict it to that.

@preames (Collaborator, Author) commented Jun 13, 2025

See also #144112. Waiting on discussion in #143967 to converge, but I'm guessing I'm going to end up reworking the approach in this patch, so review attention is probably better focused on the instsimplify patch for the moment.

@preames preames changed the title [InstCombine] Implement vp.reverse elimination through binop/unop [InstCombine] Implement vp.reverse reordering/elimination through binop/unop Jun 16, 2025
@preames (Collaborator, Author) commented Jun 16, 2025

Now that #144112 has landed, I've reworked this patch to generalize the binop handling by reordering the reverse to after the binop, which is slightly more general; a sketch follows.
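
Roughly, based on the CHECK lines quoted in the review thread below (the diffevl test, where the outer reverse uses a different EVL than the operand reverses):

  ; before: both operand reverses use %evl, the outer reverse uses 10
  %a.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %a, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %b.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %b, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %add = add nsw <vscale x 4 x i32> %a.rev, %b.rev
  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 10)

  ; after: the operand reverses are folded into one reverse of the binop result
  %add = add nsw <vscale x 4 x i32> %a, %b
  %add1 = call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add, <vscale x 4 x i1> splat (i1 true), i32 %evl)
  %add.rev = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse(<vscale x 4 x i32> %add1, <vscale x 4 x i1> splat (i1 true), i32 10)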

; CHECK-NEXT: [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> splat (i1 true), i32 10)
; CHECK-NEXT: [[ADD:%.*]] = add nsw <vscale x 4 x i32> [[A_REV:%.*]], [[B_REV:%.*]]
; CHECK-NEXT: [[ADD1:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]], <vscale x 4 x i1> splat (i1 true), i32 [[EVL:%.*]])
; CHECK-NEXT: [[ADD_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.vp.reverse.nxv4i32(<vscale x 4 x i32> [[ADD1]], <vscale x 4 x i1> splat (i1 true), i32 10)
@preames (Collaborator, Author) commented Jun 16, 2025 (inline comment on the lines above):

I believe we could add a separate combine to take the min of the two EVLs here, since all the other elements are poison. Possibly a follow-up.

Edit: nevermind, reversing the bottom 10 elements, and then reversing the bottom 5 of that isn't the same as the original bottom 5.
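
Writing out the lanes x0..x9 makes the edit concrete:

  rev(x, 10)         = <x9, x8, x7, x6, x5, x4, x3, x2, x1, x0>
  rev(rev(x, 10), 5) = <x5, x6, x7, x8, x9, ...>   ; reverse of the first five lanes above
  rev(x, 5)          = <x4, x3, x2, x1, x0, ...>

so folding to the min of the two EVLs would produce x5..x9 rather than the reverse of x0..x4.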


github-actions bot commented Jun 16, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@topperc (Collaborator) commented Jun 17, 2025

Our downstream has a recursive canEvaluateVPReversed that recursively walks through multiple instructions to handle more complex cases. Unfortunately, it was written when all the arithmetic ops were vp intrinsics so it's not directly useful as is. Would you like me to share it anyway or work on porting it to non-VP?

@preames (Collaborator, Author) commented Jun 17, 2025

> Our downstream has a recursive canEvaluateVPReversed that recursively walks through multiple instructions to handle more complex cases. Unfortunately, it was written when all the arithmetic ops were vp intrinsics so it's not directly useful as is. Would you like me to share it anyway or work on porting it to non-VP?

Happy to take a look, but to confirm: you're not intending that to be blocking for this patch, right?

@topperc (Collaborator) commented Jun 17, 2025

> Our downstream has a recursive canEvaluateVPReversed that recursively walks through multiple instructions to handle more complex cases. Unfortunately, it was written when all the arithmetic ops were vp intrinsics so it's not directly useful as is. Would you like me to share it anyway or work on porting it to non-VP?

> Happy to take a look, but to confirm: you're not intending that to be blocking for this patch, right?

No. Not blocking for this patch.

@dtcxzyw (Member) left a review comment:

LGTM

@preames preames merged commit b5aaf9d into llvm:main Jun 18, 2025
7 checks passed
@preames preames deleted the pr-instcombine-vp-reduce-elimination branch June 18, 2025 15:53