
Commit e9ab434

[InstCombine] Pull vector reverse through intrinsics
This is the intrinsic version of #146349, and handles fabs as well as other intrinsics. It is largely a copy of InstCombinerImpl::foldShuffledIntrinsicOperands, but a bit simpler since we don't need to find a common mask. Creating a separate function seems cleaner than trying to shoehorn it into the existing one.
1 parent 1ae4ddc commit e9ab434
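
To illustrate the fold, here is a minimal IR sketch distilled from the binop_intrinsic_reverse test updated below (the function names and the declarations are added for the example only and are not part of the commit). The transform rewrites the first form into the second, trading one reverse per operand for a single reverse of the result:

declare <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32>)
declare <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>)

; Before the fold: each operand of the intrinsic is reversed individually.
define <vscale x 4 x i32> @smax_of_reverses_before(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
  %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
  %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
  %smax = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> %a.rev, <vscale x 4 x i32> %b.rev)
  ret <vscale x 4 x i32> %smax
}

; After the fold: the intrinsic operates on the original operands and a single
; reverse is applied to the result.
define <vscale x 4 x i32> @smax_of_reverses_after(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
  %smax = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
  %smax.rev = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %smax)
  ret <vscale x 4 x i32> %smax.rev
}

Because the intrinsic is elementwise and speculatable, applying it before or after the lane permutation gives the same result; the fold only fires when at least one reversed operand has a single use, so the instruction count does not grow.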

File tree

3 files changed: 64 additions & 33 deletions (+64 / -33 lines)

llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp

Lines changed: 42 additions & 0 deletions
@@ -1456,6 +1456,45 @@ InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) {
   return new ShuffleVectorInst(NewIntrinsic, Mask);
 }
 
+/// If all arguments of the intrinsic are reverses, try to pull the reverse
+/// after the intrinsic.
+Value *InstCombinerImpl::foldReversedIntrinsicOperands(IntrinsicInst *II) {
+  if (!isTriviallyVectorizable(II->getIntrinsicID()) ||
+      !II->getCalledFunction()->isSpeculatable())
+    return nullptr;
+
+  // At least 1 operand must be a reverse with 1 use because we are creating 2
+  // instructions.
+  if (none_of(II->args(), [](Value *V) {
+        return match(V, m_OneUse(m_VecReverse(m_Value())));
+      }))
+    return nullptr;
+
+  Value *X;
+  Constant *C;
+  SmallVector<Value *> NewArgs;
+  for (Use &Arg : II->args()) {
+    if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
+                                           Arg.getOperandNo(), nullptr))
+      NewArgs.push_back(Arg);
+    else if (match(&Arg, m_VecReverse(m_Value(X))))
+      NewArgs.push_back(X);
+    else if (Value *Splat = getSplatValue(Arg))
+      NewArgs.push_back(Builder.CreateVectorSplat(
+          cast<VectorType>(Arg->getType())->getElementCount(), Splat));
+    else if (match(&Arg, m_ImmConstant(C)))
+      NewArgs.push_back(Builder.CreateVectorReverse(C));
+    else
+      return nullptr;
+  }
+
+  // intrinsic (reverse X), (reverse Y), ... --> reverse (intrinsic X, Y, ...)
+  Instruction *FPI = isa<FPMathOperator>(II) ? II : nullptr;
+  Instruction *NewIntrinsic = Builder.CreateIntrinsic(
+      II->getType(), II->getIntrinsicID(), NewArgs, FPI);
+  return Builder.CreateVectorReverse(NewIntrinsic);
+}
+
 /// Fold the following cases and accepts bswap and bitreverse intrinsics:
 ///   bswap(logic_op(bswap(x), y)) --> logic_op(x, bswap(y))
 ///   bswap(logic_op(bswap(x), bswap(y))) --> logic_op(x, y) (ignores multiuse)
@@ -3867,6 +3906,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
   if (Instruction *Shuf = foldShuffledIntrinsicOperands(II))
     return Shuf;
 
+  if (Value *Reverse = foldReversedIntrinsicOperands(II))
+    return replaceInstUsesWith(*II, Reverse);
+
   // Some intrinsics (like experimental_gc_statepoint) can be used in invoke
   // context, so it is handled in visitCallBase and we should trigger it.
   return visitCallBase(*II);

llvm/lib/Transforms/InstCombine/InstCombineInternal.h

Lines changed: 1 addition & 0 deletions
@@ -148,6 +148,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
   Instruction *foldItoFPtoI(CastInst &FI);
   Instruction *visitSelectInst(SelectInst &SI);
   Instruction *foldShuffledIntrinsicOperands(IntrinsicInst *II);
+  Value *foldReversedIntrinsicOperands(IntrinsicInst *II);
   Instruction *visitCallInst(CallInst &CI);
   Instruction *visitInvokeInst(InvokeInst &II);
   Instruction *visitCallBrInst(CallBrInst &CBI);

llvm/test/Transforms/InstCombine/vector-reverse.ll

Lines changed: 21 additions & 33 deletions
@@ -19,10 +19,9 @@ define <vscale x 4 x i32> @binop_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i3
 
 define <vscale x 4 x i32> @binop_intrinsic_reverse(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: @binop_intrinsic_reverse(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
-; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> [[A_REV]], <vscale x 4 x i32> [[B_REV]])
-; CHECK-NEXT: ret <vscale x 4 x i32> [[ADD]]
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> [[A_REV:%.*]], <vscale x 4 x i32> [[B_REV:%.*]])
+; CHECK-NEXT: [[SMAX:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[ADD]])
+; CHECK-NEXT: ret <vscale x 4 x i32> [[SMAX]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
   %b.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %b)
@@ -49,10 +48,10 @@ define <vscale x 4 x i32> @binop_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x
 ; %a.rev has multiple uses
 define <vscale x 4 x i32> @binop_intrinsic_reverse_1(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
 ; CHECK-LABEL: @binop_intrinsic_reverse_1(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[A:%.*]])
 ; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[B:%.*]])
-; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[A_REV]])
-; CHECK-NEXT: [[SMAX:%.*]] = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> [[A_REV]], <vscale x 4 x i32> [[B_REV]])
+; CHECK-NEXT: call void @use_nxv4i32(<vscale x 4 x i32> [[B_REV]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.smax.nxv4i32(<vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[B1:%.*]])
+; CHECK-NEXT: [[SMAX:%.*]] = call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> [[TMP1]])
 ; CHECK-NEXT: ret <vscale x 4 x i32> [[SMAX]]
 ;
   %a.rev = tail call <vscale x 4 x i32> @llvm.vector.reverse.nxv4i32(<vscale x 4 x i32> %a)
@@ -233,9 +232,9 @@ define <vscale x 4 x float> @unop_reverse_1(<vscale x 4 x float> %a) {
 
 define <vscale x 4 x float> @unop_intrinsic_reverse(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: @unop_intrinsic_reverse(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
-; CHECK-NEXT: [[NEG:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[A_REV]])
-; CHECK-NEXT: ret <vscale x 4 x float> [[NEG]]
+; CHECK-NEXT: [[NEG:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[A_REV:%.*]])
+; CHECK-NEXT: [[ABS:%.*]] = call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[NEG]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[ABS]]
 ;
   %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
   %abs = call <vscale x 4 x float> @llvm.fabs(<vscale x 4 x float> %a.rev)
@@ -689,11 +688,8 @@ define <vscale x 4 x float> @reverse_binop_reverse(<vscale x 4 x float> %a, <vsc
 
 define <vscale x 4 x float> @reverse_binop_intrinsic_reverse(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
 ; CHECK-LABEL: @reverse_binop_intrinsic_reverse(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
-; CHECK-NEXT: [[B_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[B:%.*]])
-; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[A_REV]], <vscale x 4 x float> [[B_REV]])
-; CHECK-NEXT: [[MAXNUM_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[ADD]])
-; CHECK-NEXT: ret <vscale x 4 x float> [[MAXNUM_REV]]
+; CHECK-NEXT: [[ADD:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[A_REV:%.*]], <vscale x 4 x float> [[B_REV:%.*]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[ADD]]
 ;
   %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
   %b.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %b)
@@ -734,12 +730,10 @@ define <vscale x 4 x float> @reverse_binop_reverse_splat_LHS(<vscale x 4 x float
 
 define <vscale x 4 x float> @reverse_binop_reverse_intrinsic_splat_RHS(<vscale x 4 x float> %a, float %b) {
 ; CHECK-LABEL: @reverse_binop_reverse_intrinsic_splat_RHS(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
 ; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[B:%.*]], i64 0
 ; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[B_INSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[MAXNUM:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[A_REV]], <vscale x 4 x float> [[B_SPLAT]])
-; CHECK-NEXT: [[MAXNUM_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[MAXNUM]])
-; CHECK-NEXT: ret <vscale x 4 x float> [[MAXNUM_REV]]
+; CHECK-NEXT: [[MAXNUM:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[A_REV:%.*]], <vscale x 4 x float> [[B_SPLAT]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[MAXNUM]]
 ;
   %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
   %b.insert = insertelement <vscale x 4 x float> poison, float %b, i32 0
@@ -751,12 +745,10 @@ define <vscale x 4 x float> @reverse_binop_reverse_intrinsic_splat_RHS(<vscale x
 
 define <vscale x 4 x float> @reverse_binop_reverse_intrinsic_splat_LHS(<vscale x 4 x float> %a, float %b) {
 ; CHECK-LABEL: @reverse_binop_reverse_intrinsic_splat_LHS(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
 ; CHECK-NEXT: [[B_INSERT:%.*]] = insertelement <vscale x 4 x float> poison, float [[B:%.*]], i64 0
 ; CHECK-NEXT: [[B_SPLAT:%.*]] = shufflevector <vscale x 4 x float> [[B_INSERT]], <vscale x 4 x float> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[MAXNUM:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[B_SPLAT]], <vscale x 4 x float> [[A_REV]])
-; CHECK-NEXT: [[MAXNUM_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[MAXNUM]])
-; CHECK-NEXT: ret <vscale x 4 x float> [[MAXNUM_REV]]
+; CHECK-NEXT: [[MAXNUM:%.*]] = call <vscale x 4 x float> @llvm.maxnum.nxv4f32(<vscale x 4 x float> [[B_SPLAT]], <vscale x 4 x float> [[A_REV:%.*]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[MAXNUM]]
 ;
   %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
   %b.insert = insertelement <vscale x 4 x float> poison, float %b, i32 0
@@ -768,8 +760,8 @@ define <vscale x 4 x float> @reverse_binop_reverse_intrinsic_splat_LHS(<vscale x
 
 define <4 x float> @reverse_binop_reverse_intrinsic_constant_RHS(<4 x float> %a) {
 ; CHECK-LABEL: @reverse_binop_reverse_intrinsic_constant_RHS(
-; CHECK-NEXT: [[MAXNUM_REV1:%.*]] = tail call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> [[MAXNUM1:%.*]])
-; CHECK-NEXT: [[MAXNUM:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[MAXNUM_REV1]], <4 x float> <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>)
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[A:%.*]], <4 x float> <float 3.000000e+00, float 2.000000e+00, float 1.000000e+00, float 0.000000e+00>)
+; CHECK-NEXT: [[MAXNUM:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT: [[MAXNUM_REV:%.*]] = tail call <4 x float> @llvm.vector.reverse.v4f32(<4 x float> [[MAXNUM]])
 ; CHECK-NEXT: ret <4 x float> [[MAXNUM_REV]]
 ;
@@ -817,10 +809,8 @@ define <vscale x 4 x float> @reverse_unop_reverse(<vscale x 4 x float> %a) {
 
 define <vscale x 4 x float> @reverse_unop_intrinsic_reverse(<vscale x 4 x float> %a) {
 ; CHECK-LABEL: @reverse_unop_intrinsic_reverse(
-; CHECK-NEXT: [[A_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[A:%.*]])
-; CHECK-NEXT: [[ABS:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[A_REV]])
-; CHECK-NEXT: [[ABS_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[ABS]])
-; CHECK-NEXT: ret <vscale x 4 x float> [[ABS_REV]]
+; CHECK-NEXT: [[ABS:%.*]] = call <vscale x 4 x float> @llvm.fabs.nxv4f32(<vscale x 4 x float> [[A_REV:%.*]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[ABS]]
 ;
   %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
   %abs = call <vscale x 4 x float> @llvm.fabs(<vscale x 4 x float> %a.rev)
@@ -830,10 +820,8 @@ define <vscale x 4 x float> @reverse_unop_intrinsic_reverse(<vscale x 4 x float>
 
 define <vscale x 4 x float> @reverse_unop_intrinsic_reverse_scalar_arg(<vscale x 4 x float> %a, i32 %power) {
 ; CHECK-LABEL: @reverse_unop_intrinsic_reverse_scalar_arg(
-; CHECK-NEXT: [[A:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[A1:%.*]])
-; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> [[A]], i32 [[POWER:%.*]])
-; CHECK-NEXT: [[POWI_REV:%.*]] = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> [[TMP1]])
-; CHECK-NEXT: ret <vscale x 4 x float> [[POWI_REV]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x float> @llvm.powi.nxv4f32.i32(<vscale x 4 x float> [[A:%.*]], i32 [[POWER:%.*]])
+; CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
 ;
   %a.rev = tail call <vscale x 4 x float> @llvm.vector.reverse.nxv4f32(<vscale x 4 x float> %a)
   %powi = call <vscale x 4 x float> @llvm.powi.nxv4f32(<vscale x 4 x float> %a.rev, i32 %power)
