Skip to content

[SLP][REVEC] Fix cost model for getGatherCost with FixedVectorType ScalarTy. #109369

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Sep 24, 2024

Conversation

HanKuanChen
Copy link
Contributor

No description provided.

@llvmbot
Copy link
Member

llvmbot commented Sep 20, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Han-Kuan Chen (HanKuanChen)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/109369.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+22-10)
  • (added) llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll (+31)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ddeb97463281c9..490d40bb182d34 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -11732,8 +11732,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
   // Find the cost of inserting/extracting values from the vector.
   // Check if the same elements are inserted several times and count them as
   // shuffle candidates.
-  unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-  APInt ShuffledElements = APInt::getZero(VecTy->getNumElements());
+  APInt ShuffledElements = APInt::getZero(VL.size());
   DenseMap<Value *, unsigned> UniqueElements;
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost Cost;
@@ -11753,8 +11752,7 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
     Value *V = VL[I];
     // No need to shuffle duplicates for constants.
     if ((ForPoisonSrc && isConstant(V)) || isa<UndefValue>(V)) {
-      ShuffledElements.setBits(I * ScalarTyNumElements,
-                               I * ScalarTyNumElements + ScalarTyNumElements);
+      ShuffledElements.setBit(I);
       ShuffleMask[I] = isa<PoisonValue>(V) ? PoisonMaskElem : I;
       continue;
     }
@@ -11767,14 +11765,28 @@ InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL, bool ForPoisonSrc,
     }
 
     DuplicateNonConst = true;
-    ShuffledElements.setBits(I * ScalarTyNumElements,
-                             I * ScalarTyNumElements + ScalarTyNumElements);
+    ShuffledElements.setBit(I);
     ShuffleMask[I] = Res.first->second;
   }
-  if (ForPoisonSrc)
-    Cost =
-        TTI->getScalarizationOverhead(VecTy, ~ShuffledElements, /*Insert*/ true,
-                                      /*Extract*/ false, CostKind);
+  if (ForPoisonSrc) {
+    if (isa<FixedVectorType>(ScalarTy)) {
+      assert(SLPReVec && "Only supported by REVEC.");
+      // We don't need to insert elements one by one. Instead, we can insert the
+      // entire vector into the destination.
+      Cost = 0;
+      unsigned ScalarTyNumElements = getNumElements(ScalarTy);
+      for (unsigned I = 0, E = VL.size(); I != E; ++I)
+        if (!ShuffledElements[I])
+          Cost += TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy,
+                                      std::nullopt, TTI::TCK_RecipThroughput,
+                                      I * ScalarTyNumElements,
+                                      cast<FixedVectorType>(ScalarTy));
+    } else {
+      Cost = TTI->getScalarizationOverhead(VecTy, ~ShuffledElements,
+                                           /*Insert*/ true,
+                                           /*Extract*/ false, CostKind);
+    }
+  }
   if (DuplicateNonConst)
     Cost += ::getShuffleCost(*TTI, TargetTransformInfo::SK_PermuteSingleSrc,
                              VecTy, ShuffleMask);
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
new file mode 100644
index 00000000000000..648eb5bc5119dc
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/revec-getGatherCost.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-10 %s | FileCheck %s
+
+define void @test(<4 x float> %load6, <4 x float> %load7, <4 x float> %load8, <4 x float> %load17, <4 x float> %fmuladd7, <4 x float> %fmuladd16, ptr %out_ptr) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[VEXT165_I:%.*]] = shufflevector <4 x float> [[LOAD6:%.*]], <4 x float> [[LOAD7:%.*]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[VEXT309_I:%.*]] = shufflevector <4 x float> [[LOAD7]], <4 x float> [[LOAD8:%.*]], <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[VEXT165_I]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP0]], <4 x float> [[VEXT309_I]], i64 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> poison, i64 4)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP2]], <4 x float> [[LOAD17:%.*]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[FMULADD7:%.*]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP5]], <4 x float> [[FMULADD16:%.*]], i64 4)
+; CHECK-NEXT:    [[TMP7:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP1]], <8 x float> [[TMP4]], <8 x float> [[TMP6]])
+; CHECK-NEXT:    store <8 x float> [[TMP7]], ptr [[OUT_PTR:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %vext165.i = shufflevector <4 x float> %load6, <4 x float> %load7, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %vext309.i = shufflevector <4 x float> %load7, <4 x float> %load8, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
+  %fmuladd8 = tail call noundef <4 x float> @llvm.fmuladd.v4f32(<4 x float> %vext165.i, <4 x float> %load17, <4 x float> %fmuladd7)
+  %fmuladd17 = tail call noundef <4 x float> @llvm.fmuladd.v4f32(<4 x float> %vext309.i, <4 x float> %load17, <4 x float> %fmuladd16)
+  %add.ptr.i.i = getelementptr inbounds i8, ptr %out_ptr, i64 16
+  store <4 x float> %fmuladd8, ptr %out_ptr, align 4
+  store <4 x float> %fmuladd17, ptr %add.ptr.i.i, align 4
+  ret void
+}
+
+declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>)

@HanKuanChen HanKuanChen force-pushed the slp-revec-getGatherCost branch 2 times, most recently from 14101df to 5915170 Compare September 20, 2024 07:06
@@ -0,0 +1,31 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-revec -slp-max-reg-size=1024 -slp-threshold=-10 %s | FileCheck %s
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would be good to add remarks check to see cost changes

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

Copy link
Member

@alexey-bataev alexey-bataev left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LG

@HanKuanChen HanKuanChen merged commit 0062975 into llvm:main Sep 24, 2024
8 checks passed
@HanKuanChen HanKuanChen deleted the slp-revec-getGatherCost branch September 24, 2024 00:08
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants