Skip to content

Commit baab4aa

Browse files
committed
[VectorCombine] convert scalar fneg with insert/extract to vector fneg
insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index --> shuffle DestVec, (fneg SrcVec), Mask. This is a specialized form of what could be a more general fold for a binop. It's also possible that fneg is overlooked by SLP in this kind of insert/extract pattern since it's a unary op. This shows up in the motivating example from issue #58139, but it won't solve it (that probably requires some x86-specific backend changes). There are also some small enhancements (see TODO comments) that can be done as follow-up patches. Differential Revision: https://reviews.llvm.org/D135278
1 parent 6ace81d commit baab4aa

File tree

2 files changed

+109
-19
lines changed

2 files changed

+109
-19
lines changed

llvm/lib/Transforms/Vectorize/VectorCombine.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "llvm/Support/CommandLine.h"
3131
#include "llvm/Transforms/Utils/Local.h"
3232
#include "llvm/Transforms/Vectorize.h"
33+
#include <numeric>
3334

3435
#define DEBUG_TYPE "vector-combine"
3536
#include "llvm/Transforms/Utils/InstructionWorklist.h"
@@ -97,6 +98,7 @@ class VectorCombine {
9798
void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
9899
Instruction &I);
99100
bool foldExtractExtract(Instruction &I);
101+
bool foldInsExtFNeg(Instruction &I);
100102
bool foldBitcastShuf(Instruction &I);
101103
bool scalarizeBinopOrCmp(Instruction &I);
102104
bool foldExtractedCmps(Instruction &I);
@@ -533,6 +535,67 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
533535
return true;
534536
}
535537

538+
/// Try to replace an extract + scalar fneg + insert with a vector fneg +
539+
/// shuffle.
540+
bool VectorCombine::foldInsExtFNeg(Instruction &I) {
541+
auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
542+
if (!VecTy)
543+
return false;
544+
545+
// Match an insert (op (extract)) pattern.
546+
Value *DestVec;
547+
uint64_t Index;
548+
Instruction *FNeg;
549+
if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
550+
m_ConstantInt(Index))))
551+
return false;
552+
553+
Value *SrcVec;
554+
if (!match(FNeg, m_FNeg(m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index)))))
555+
return false;
556+
557+
if (SrcVec->getType() != VecTy)
558+
return false;
559+
560+
// Ignore bogus insert/extract index.
561+
unsigned NumElts = VecTy->getNumElements();
562+
if (Index >= NumElts)
563+
return false;
564+
565+
// We are inserting the negated element into the same lane that we extracted
566+
// from. This is equivalent to a select-shuffle that chooses all but the
567+
// negated element from the destination vector.
568+
SmallVector<int> Mask(NumElts);
569+
std::iota(Mask.begin(), Mask.end(), 0);
570+
Mask[Index] = Index + NumElts;
571+
572+
Type *ScalarTy = VecTy->getScalarType();
573+
InstructionCost OldCost =
574+
TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy) +
575+
TTI.getVectorInstrCost(I, VecTy, Index);
576+
577+
// If the extract has one use, it will be eliminated, so count it in the
578+
// original cost. If it has more than one use, ignore the cost because it will
579+
// be the same before/after.
580+
Instruction *Extract = cast<Instruction>(FNeg->getOperand(0));
581+
if (Extract->hasOneUse())
582+
OldCost += TTI.getVectorInstrCost(*Extract, VecTy, Index);
583+
584+
InstructionCost NewCost =
585+
TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy) +
586+
TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
587+
588+
if (NewCost > OldCost)
589+
return false;
590+
591+
// insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index -->
592+
// shuffle DestVec, (fneg SrcVec), Mask
593+
Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
594+
Value *Shuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
595+
replaceValue(I, *Shuf);
596+
return true;
597+
}
598+
536599
/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
537600
/// destination type followed by shuffle. This can enable further transforms by
538601
/// moving bitcasts or shuffles together.
@@ -1571,6 +1634,7 @@ bool VectorCombine::run() {
15711634
if (!ScalarizationOnly) {
15721635
MadeChange |= vectorizeLoadInsert(I);
15731636
MadeChange |= foldExtractExtract(I);
1637+
MadeChange |= foldInsExtFNeg(I);
15741638
MadeChange |= foldBitcastShuf(I);
15751639
MadeChange |= foldExtractedCmps(I);
15761640
MadeChange |= foldShuffleOfBinops(I);

llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll

Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2-
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK
3-
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK
2+
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=SSE2 | FileCheck %s --check-prefixes=CHECK,SSE
3+
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mattr=AVX2 | FileCheck %s --check-prefixes=CHECK,AVX
44

55
declare void @use(float)
66

7+
; TODO: The insert is costed as free, so creating a shuffle appears to be a loss.
8+
79
define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
810
; CHECK-LABEL: @ext0_v4f32(
911
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 0
@@ -21,9 +23,8 @@ define <4 x float> @ext0_v4f32(<4 x float> %x, <4 x float> %y) {
2123

2224
define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
2325
; CHECK-LABEL: @ext2_v4f32(
24-
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 2
25-
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
26-
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 2
26+
; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
27+
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
2728
; CHECK-NEXT: ret <4 x float> [[R]]
2829
;
2930
%e = extractelement <4 x float> %x, i32 2
@@ -36,9 +37,8 @@ define <4 x float> @ext2_v4f32(<4 x float> %x, <4 x float> %y) {
3637

3738
define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
3839
; CHECK-LABEL: @ext1_v2f64(
39-
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
40-
; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
41-
; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1
40+
; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
41+
; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 0, i32 3>
4242
; CHECK-NEXT: ret <2 x double> [[R]]
4343
;
4444
%e = extractelement <2 x double> %x, i32 1
@@ -47,26 +47,43 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
4747
ret <2 x double> %r
4848
}
4949

50+
; The vector fneg would cost twice as much as the scalar op with SSE,
51+
; so we don't transform there (the shuffle would also be more expensive).
52+
5053
define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
51-
; CHECK-LABEL: @ext7_v8f32(
52-
; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
53-
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
54-
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
55-
; CHECK-NEXT: ret <8 x float> [[R]]
54+
; SSE-LABEL: @ext7_v8f32(
55+
; SSE-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
56+
; SSE-NEXT: [[N:%.*]] = fneg float [[E]]
57+
; SSE-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
58+
; SSE-NEXT: ret <8 x float> [[R]]
59+
;
60+
; AVX-LABEL: @ext7_v8f32(
61+
; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
62+
; AVX-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 15>
63+
; AVX-NEXT: ret <8 x float> [[R]]
5664
;
5765
%e = extractelement <8 x float> %x, i32 7
5866
%n = fneg float %e
5967
%r = insertelement <8 x float> %y, float %n, i32 7
6068
ret <8 x float> %r
6169
}
6270

71+
; Same as above with an extra use of the extracted element.
72+
6373
define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
64-
; CHECK-LABEL: @ext7_v8f32_use1(
65-
; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
66-
; CHECK-NEXT: call void @use(float [[E]])
67-
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
68-
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 5
69-
; CHECK-NEXT: ret <8 x float> [[R]]
74+
; SSE-LABEL: @ext7_v8f32_use1(
75+
; SSE-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
76+
; SSE-NEXT: call void @use(float [[E]])
77+
; SSE-NEXT: [[N:%.*]] = fneg float [[E]]
78+
; SSE-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 5
79+
; SSE-NEXT: ret <8 x float> [[R]]
80+
;
81+
; AVX-LABEL: @ext7_v8f32_use1(
82+
; AVX-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 5
83+
; AVX-NEXT: call void @use(float [[E]])
84+
; AVX-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X]]
85+
; AVX-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 13, i32 6, i32 7>
86+
; AVX-NEXT: ret <8 x float> [[R]]
7087
;
7188
%e = extractelement <8 x float> %x, i32 5
7289
call void @use(float %e)
@@ -75,6 +92,8 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
7592
ret <8 x float> %r
7693
}
7794

95+
; Negative test - the transform is likely not profitable if the fneg has another use.
96+
7897
define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
7998
; CHECK-LABEL: @ext7_v8f32_use2(
8099
; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 3
@@ -90,6 +109,8 @@ define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
90109
ret <8 x float> %r
91110
}
92111

112+
; Negative test - can't convert variable index to a shuffle.
113+
93114
define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) {
94115
; CHECK-LABEL: @ext_index_var_v2f64(
95116
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 [[INDEX:%.*]]
@@ -103,6 +124,9 @@ define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %
103124
ret <2 x double> %r
104125
}
105126

127+
; Negative test - require same extract/insert index for simple shuffle.
128+
; TODO: We could handle this by adjusting the cost calculation.
129+
106130
define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
107131
; CHECK-LABEL: @ext1_v2f64_ins0(
108132
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
@@ -116,6 +140,8 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
116140
ret <2 x double> %r
117141
}
118142

143+
; Negative test - avoid changing poison ops
144+
119145
define <4 x float> @ext12_v4f32(<4 x float> %x, <4 x float> %y) {
120146
; CHECK-LABEL: @ext12_v4f32(
121147
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 12

0 commit comments

Comments
 (0)