[SLP]Fix the cost of the adjusted extracts in per-register analysis. #96808

alexey-bataev · 2024-06-26T18:26:31Z

Previous patch did not pass the list of the extract indices by
reference, so the compiler just ignored them. Pass indices by reference
and fix the per-register analysis.

Created using spr 1.3.5

llvmbot · 2024-06-26T18:27:00Z

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Previous patch did not pass the list of the extract indices by
reference, so the compiler just ignored them. Pass indices by reference
and fix the per-register analysis.

Patch is 31.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/96808.diff

4 Files Affected:

(modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+46-15)
(modified) llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll (+84-60)
(modified) llvm/test/Transforms/SLPVectorizer/X86/alternate-calls-inseltpoison.ll (+15-13)
(modified) llvm/test/Transforms/SLPVectorizer/X86/alternate-calls.ll (+15-13)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 974f966d46e81..5d6ddb787ac3c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -8304,20 +8304,31 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
         });
     // FIXME: this must be moved to TTI for better estimation.
     unsigned EltsPerVector = getPartNumElems(VL.size(), NumParts);
-    auto CheckPerRegistersShuffle =
-        [&](MutableArrayRef<int> Mask,
-            SmallVector<int> Indices) -> std::optional<TTI::ShuffleKind> {
+    auto CheckPerRegistersShuffle = [&](MutableArrayRef<int> Mask,
+                                        SmallVectorImpl<unsigned> &Indices)
+        -> std::optional<TTI::ShuffleKind> {
       if (NumElts <= EltsPerVector)
         return std::nullopt;
+      int OffsetReg0 =
+          alignDown(std::accumulate(Mask.begin(), Mask.end(), INT_MAX,
+                                    [](int S, int I) {
+                                      if (I == PoisonMaskElem)
+                                        return S;
+                                      return std::min(S, I);
+                                    }),
+                    EltsPerVector);
+      int OffsetReg1 = OffsetReg0;
       DenseSet<int> RegIndices;
       // Check that if trying to permute same single/2 input vectors.
       TTI::ShuffleKind ShuffleKind = TTI::SK_PermuteSingleSrc;
       int FirstRegId = -1;
-      Indices.assign(1, -1);
-      for (int &I : Mask) {
+      Indices.assign(1, OffsetReg0);
+      for (auto [Pos, I] : enumerate(Mask)) {
         if (I == PoisonMaskElem)
           continue;
-        int RegId = (I / NumElts) * NumParts + (I % NumElts) / EltsPerVector;
+        int Idx = I - OffsetReg0;
+        int RegId =
+            (Idx / NumElts) * NumParts + (Idx % NumElts) / EltsPerVector;
         if (FirstRegId < 0)
           FirstRegId = RegId;
         RegIndices.insert(RegId);
@@ -8325,14 +8336,25 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
           return std::nullopt;
         if (RegIndices.size() == 2) {
           ShuffleKind = TTI::SK_PermuteTwoSrc;
-          if (Indices.size() == 1)
-            Indices.push_back(-1);
+          if (Indices.size() == 1) {
+            OffsetReg1 = alignDown(
+                std::accumulate(
+                    std::next(Mask.begin(), Pos), Mask.end(), INT_MAX,
+                    [&](int S, int I) {
+                      if (I == PoisonMaskElem)
+                        return S;
+                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
+                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;
+                      if (RegId == FirstRegId)
+                        return S;
+                      return std::min(S, I);
+                    }),
+                EltsPerVector);
+            Indices.push_back(OffsetReg1);
+          }
+          Idx = I - OffsetReg1;
         }
-        if (RegId == FirstRegId)
-          Indices.front() = I % NumElts;
-        else
-          Indices.back() = I % NumElts;
-        I = (I % NumElts) % EltsPerVector +
+        I = (Idx % NumElts) % EltsPerVector +
             (RegId == FirstRegId ? 0 : EltsPerVector);
       }
       return ShuffleKind;
@@ -8349,7 +8371,7 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
           Part * EltsPerVector, getNumElems(Mask.size(), EltsPerVector, Part));
       SmallVector<int> SubMask(EltsPerVector, PoisonMaskElem);
       copy(MaskSlice, SubMask.begin());
-      SmallVector<int> Indices;
+      SmallVector<unsigned, 2> Indices;
       std::optional<TTI::ShuffleKind> RegShuffleKind =
           CheckPerRegistersShuffle(SubMask, Indices);
       if (!RegShuffleKind) {
@@ -8367,12 +8389,21 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis {
                                  FixedVectorType::get(ScalarTy, EltsPerVector),
                                  SubMask);
       }
-      for (int Idx : Indices) {
+      for (unsigned Idx : Indices) {
         Cost += ::getShuffleCost(TTI, TTI::SK_ExtractSubvector,
                                  FixedVectorType::get(ScalarTy, NumElts),
                                  std::nullopt, CostKind, Idx,
                                  FixedVectorType::get(ScalarTy, EltsPerVector));
       }
+      // Second attempt to check, if just a permute is better estimated than
+      // subvector extract.
+      SubMask.assign(NumElts, PoisonMaskElem);
+      copy(MaskSlice, SubMask.begin());
+      InstructionCost OriginalCost =
+          ::getShuffleCost(TTI, *ShuffleKinds[Part],
+                           FixedVectorType::get(ScalarTy, NumElts), SubMask);
+      if (OriginalCost < Cost)
+        Cost = OriginalCost;
     }
     return Cost;
   }
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
index 9608608a18098..059e4c38b519b 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/math-function.ll
@@ -155,11 +155,13 @@ define <4 x float> @exp_4x(ptr %a) {
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @exp_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -171,11 +173,13 @@ define <4 x float> @exp_4x(ptr %a) {
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @expf(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @expf(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @expf(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -208,11 +212,13 @@ define <4 x float> @int_exp_4x(ptr %a) {
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @int_exp_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -224,11 +230,13 @@ define <4 x float> @int_exp_4x(ptr %a) {
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.exp.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.exp.f32(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -261,11 +269,13 @@ define <4 x float> @log_4x(ptr %a) {
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @log_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -277,11 +287,13 @@ define <4 x float> @log_4x(ptr %a) {
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @logf(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @logf(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @logf(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -314,11 +326,13 @@ define <4 x float> @int_log_4x(ptr %a) {
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @int_log_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -330,11 +344,13 @@ define <4 x float> @int_log_4x(ptr %a) {
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.log.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.log.f32(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -367,11 +383,13 @@ define <4 x float> @sin_4x(ptr %a) {
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
 ; CHECK-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; CHECK-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT:    ret <4 x float> [[VECINS_31]]
+; CHECK-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
+; CHECK-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; CHECK-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
+; CHECK-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; CHECK-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 ; DEFAULT-LABEL: define <4 x float> @sin_4x
 ; DEFAULT-SAME: (ptr [[A:%.*]]) #[[ATTR1]] {
@@ -383,11 +401,13 @@ define <4 x float> @sin_4x(ptr %a) {
 ; DEFAULT-NEXT:    [[VECEXT_1:%.*]] = extractelement <4 x float> [[TMP0]], i32 1
 ; DEFAULT-NEXT:    [[TMP2:%.*]] = tail call fast float @sinf(float [[VECEXT_1]])
 ; DEFAULT-NEXT:    [[VECINS_1:%.*]] = insertelement <4 x float> [[VECINS]], float [[TMP2]], i32 1
-; DEFAULT-NEXT:    [[TMP3:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <2 x i32> <i32 2, i32 3>
-; DEFAULT-NEXT:    [[TMP4:%.*]] = call fast <2 x float> @llvm.sin.v2f32(<2 x float> [[TMP3]])
-; DEFAULT-NEXT:    [[TMP5:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; DEFAULT-NEXT:    [[VECINS_31:%.*]] = shufflevector <4 x float> [[VECINS_1]], <4 x float> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; DEFAULT-NEXT:    ret <4 x float> [[VECINS_31]]
+; DEFAULT-NEXT:    [[VECEXT_2:%.*]] = extractelement <4 x float> [[TMP0]], i32 2
+; DEFAULT-NEXT:    [[TMP3:%.*]] = tail call fast float @sinf(float [[VECEXT_2]])
+; DEFAULT-NEXT:    [[VECINS_2:%.*]] = insertelement <4 x float> [[VECINS_1]], float [[TMP3]], i32 2
+; DEFAULT-NEXT:    [[VECEXT_3:%.*]] = extractelement <4 x float> [[TMP0]], i32 3
+; DEFAULT-NEXT:    [[TMP4:%.*]] = tail call fast float @sinf(float [[VECEXT_3]])
+; DEFAULT-NEXT:    [[VECINS_3:%.*]] = insertelement <4 x float> [[VECINS_2]], float [[TMP4]], i32 3
+; DEFAULT-NEXT:    ret <4 x float> [[VECINS_3]]
 ;
 entry:
   %0 = load <4 x float>, ptr %a, align 16
@@ -420,11 +440,13 @@ define <4 x float> @int_sin_4x(ptr %a) {
 ; CHECK-NEXT:    [[VECEXT_1:%.*]] = e...
[truncated]

RKSimon · 2024-06-28T09:45:53Z

llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp

+                      if (I == PoisonMaskElem)
+                        return S;
+                      int RegId = ((I - OffsetReg0) / NumElts) * NumParts +
+                                  ((I - OffsetReg0) % NumElts) / EltsPerVector;


What is this doing? Don't we just need the PartIdx?

We analyze the remaining part of the mask, trying to find the indices from the second register. But we still may have the indices from the first register. So, it calculates the register id for the remaining indices to find the minimal index only(!) from this second register.

RKSimon

LGTM - cheers

Previous patch did not pass the list of the extract indices by reference, so the compiler just ignored them. Pass indices by reference and fix the per-register analysis. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: #96808

Previous patch did not pass the list of the extract indices by reference, so the compiler just ignored them. Pass indices by reference and fix the per-register analysis. Reviewers: RKSimon Reviewed By: RKSimon Pull Request: llvm#96808

[𝘀𝗽𝗿] initial version

f4e2c8a

Created using spr 1.3.5

llvmbot added vectorizers llvm:transforms labels Jun 26, 2024

alexey-bataev requested a review from RKSimon June 26, 2024 18:27

RKSimon reviewed Jun 28, 2024

View reviewed changes

RKSimon approved these changes Jun 28, 2024

View reviewed changes

alexey-bataev merged commit 7841520 into main Jun 28, 2024
8 of 10 checks passed

alexey-bataev deleted the users/alexey-bataev/spr/slpfix-the-cost-of-the-adjusted-extracts-in-per-register-analysis branch June 28, 2024 19:49

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[SLP]Fix the cost of the adjusted extracts in per-register analysis. #96808

[SLP]Fix the cost of the adjusted extracts in per-register analysis. #96808

Uh oh!

alexey-bataev commented Jun 26, 2024

Uh oh!

llvmbot commented Jun 26, 2024

Uh oh!

RKSimon Jun 28, 2024

Uh oh!

alexey-bataev Jun 28, 2024

Uh oh!

RKSimon left a comment

Uh oh!

Uh oh!

Uh oh!

[SLP]Fix the cost of the adjusted extracts in per-register analysis. #96808

[SLP]Fix the cost of the adjusted extracts in per-register analysis. #96808

Uh oh!

Conversation

alexey-bataev commented Jun 26, 2024

Uh oh!

llvmbot commented Jun 26, 2024

Uh oh!

RKSimon Jun 28, 2024

Choose a reason for hiding this comment

Uh oh!

alexey-bataev Jun 28, 2024

Choose a reason for hiding this comment

Uh oh!

RKSimon left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!