[SLP] Enable optimization of freeze instructions #102217
Conversation
@llvm/pr-subscribers-llvm-transforms

Author: John McIver (jmciver)

Changes

Allow SLP optimization to progress in the presence of freeze instructions. Prior to this commit, freeze instructions blocked SLP optimization.

The following URL shows correctness of the addsub_freeze test: https://alive2.llvm.org/ce/z/qm38oh

Full diff: https://github.com/llvm/llvm-project/pull/102217.diff

3 Files Affected:
- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
- llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
- llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index ed9dfa66dc0b5..cea58ab34a882 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6468,6 +6468,7 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState(
case Instruction::And:
case Instruction::Or:
case Instruction::Xor:
+ case Instruction::Freeze:
return TreeEntry::Vectorize;
case Instruction::GetElementPtr: {
// We don't combine GEPs with complicated (nested) indexing.
@@ -7305,7 +7306,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
+ case Instruction::Xor:
+ case Instruction::Freeze: {
TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndices);
LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
@@ -9790,10 +9792,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
+ case Instruction::Xor:
+ case Instruction::Freeze: {
auto GetScalarCost = [&](unsigned Idx) {
auto *VI = cast<Instruction>(UniqueValues[Idx]);
- unsigned OpIdx = isa<UnaryOperator>(VI) ? 0 : 1;
+ unsigned OpIdx = isa<UnaryOperator>(VI) || isa<FreezeInst>(VI) ? 0 : 1;
TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(VI->getOperand(0));
TTI::OperandValueInfo Op2Info =
TTI::getOperandInfo(VI->getOperand(OpIdx));
@@ -9812,7 +9815,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
return CommonCost;
}
}
- unsigned OpIdx = isa<UnaryOperator>(VL0) ? 0 : 1;
+ unsigned OpIdx = isa<UnaryOperator>(VL0) || isa<FreezeInst>(VL0) ? 0 : 1;
TTI::OperandValueInfo Op1Info = getOperandInfo(E->getOperand(0));
TTI::OperandValueInfo Op2Info = getOperandInfo(E->getOperand(OpIdx));
return TTI->getArithmeticInstrCost(ShuffleOrOp, VecTy, CostKind, Op1Info,
@@ -13298,6 +13301,24 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E, bool PostponedPHIs) {
return V;
}
+ case Instruction::Freeze: {
+ setInsertPointAfterBundle(E);
+
+ Value *Op = vectorizeOperand(E, 0, PostponedPHIs);
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V = Builder.CreateFreeze(Op);
+ V = FinalShuffle(V, E, VecTy);
+
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
index 5f8941e9f8893..f7bd2431a7605 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/addsub.ll
@@ -65,6 +65,62 @@ entry:
ret void
}
+define void @addsub_freeze() #0 {
+; CHECK-LABEL: @addsub_freeze(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr @b, align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr @c, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr @d, align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr @e, align 4
+; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub nsw <4 x i32> [[TMP2]], [[TMP5]]
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: [[TMP9:%.*]] = freeze <4 x i32> [[TMP8]]
+; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr @a, align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %0 = load i32, ptr @b, align 4
+ %1 = load i32, ptr @c, align 4
+ %add = add nsw i32 %0, %1
+ %2 = load i32, ptr @d, align 4
+ %3 = load i32, ptr @e, align 4
+ %add1 = add nsw i32 %2, %3
+ %add2 = add nsw i32 %add, %add1
+ %freeze.add2 = freeze i32 %add2
+ store i32 %freeze.add2, ptr @a, align 4
+ %4 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 1), align 4
+ %5 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 1), align 4
+ %add3 = add nsw i32 %4, %5
+ %6 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 1), align 4
+ %7 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 1), align 4
+ %add4 = add nsw i32 %6, %7
+ %sub = sub nsw i32 %add3, %add4
+ %freeze.sub = freeze i32 %sub
+ store i32 %freeze.sub, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 1), align 4
+ %8 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 2), align 4
+ %9 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 2), align 4
+ %add5 = add nsw i32 %8, %9
+ %10 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 2), align 4
+ %11 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 2), align 4
+ %add6 = add nsw i32 %10, %11
+ %add7 = add nsw i32 %add5, %add6
+ %freeze.add7 = freeze i32 %add7
+ store i32 %freeze.add7, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 2), align 4
+ %12 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @b, i32 0, i64 3), align 4
+ %13 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @c, i32 0, i64 3), align 4
+ %add8 = add nsw i32 %12, %13
+ %14 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @d, i32 0, i64 3), align 4
+ %15 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @e, i32 0, i64 3), align 4
+ %add9 = add nsw i32 %14, %15
+ %sub10 = sub nsw i32 %add8, %add9
+ %freeze.sub10 = freeze i32 %sub10
+ store i32 %freeze.sub10, ptr getelementptr inbounds ([4 x i32], ptr @a, i32 0, i64 3), align 4
+ ret void
+}
+
; Function Attrs: nounwind uwtable
define void @subadd() #0 {
; CHECK-LABEL: @subadd(
@@ -301,14 +357,14 @@ define void @reorder_alt_subTree() #0 {
define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, ptr noalias nocapture readonly %d) {
; CHECK-LABEL: @reorder_alt_rightsubTree(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[D:%.*]], align 8
-; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
-; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x double> [[TMP4]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = fsub <2 x double> [[TMP7]], [[TMP2]]
-; CHECK-NEXT: [[TMP9:%.*]] = fadd <2 x double> [[TMP7]], [[TMP2]]
-; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT: store <2 x double> [[TMP10]], ptr [[C:%.*]], align 8
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[D:%.*]], align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP4]], [[TMP1]]
+; CHECK-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP4]], [[TMP1]]
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[C:%.*]], align 8
; CHECK-NEXT: ret void
;
%1 = load double, ptr %a
@@ -332,20 +388,20 @@ define void @reorder_alt_rightsubTree(ptr nocapture %c, ptr noalias nocapture re
define void @vec_shuff_reorder() #0 {
; CHECK-LABEL: @vec_shuff_reorder(
-; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fa, align 4
-; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr @fb, align 4
-; CHECK-NEXT: [[TMP5:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
-; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr @fa, align 4
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr @fb, align 4
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fb, i32 0, i64 2), align 4
+; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, ptr getelementptr inbounds ([4 x float], ptr @fa, i32 0, i64 2), align 4
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP5]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP9]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
-; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x float> [[TMP12]], <4 x float> [[TMP13]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
-; CHECK-NEXT: [[TMP15:%.*]] = fadd <4 x float> [[TMP10]], [[TMP14]]
-; CHECK-NEXT: [[TMP16:%.*]] = fsub <4 x float> [[TMP10]], [[TMP14]]
-; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
-; CHECK-NEXT: store <4 x float> [[TMP17]], ptr @fc, align 4
+; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x float> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[TMP12:%.*]] = fsub <4 x float> [[TMP7]], [[TMP10]]
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP12]], <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK-NEXT: store <4 x float> [[TMP13]], ptr @fc, align 4
; CHECK-NEXT: ret void
;
%1 = load float, ptr @fb, align 4
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll b/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll
index 28e837c2d7a4e..1804ef5e37833 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fmuladd.ll
@@ -46,6 +46,31 @@ define void @fmuladd_2f64() #0 {
ret void
}
+define void @fmuladd_2f64_freeze() #0 {
+; CHECK-LABEL: @fmuladd_2f64_freeze(
+; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @srcA64, align 8
+; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr @srcB64, align 8
+; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr @srcC64, align 8
+; CHECK-NEXT: [[TMP4:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP1]], <2 x double> [[TMP2]], <2 x double> [[TMP3]])
+; CHECK-NEXT: [[TMP5:%.*]] = freeze <2 x double> [[TMP4]]
+; CHECK-NEXT: store <2 x double> [[TMP5]], ptr @dst64, align 8
+; CHECK-NEXT: ret void
+;
+ %a0 = load double, ptr @srcA64, align 8
+ %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @srcA64, i32 0, i64 1), align 8
+ %b0 = load double, ptr @srcB64, align 8
+ %b1 = load double, ptr getelementptr inbounds ([8 x double], ptr @srcB64, i32 0, i64 1), align 8
+ %c0 = load double, ptr @srcC64, align 8
+ %c1 = load double, ptr getelementptr inbounds ([8 x double], ptr @srcC64, i32 0, i64 1), align 8
+ %fmuladd0 = call double @llvm.fmuladd.f64(double %a0, double %b0, double %c0)
+ %fmuladd1 = call double @llvm.fmuladd.f64(double %a1, double %b1, double %c1)
+ %freeze0 = freeze double %fmuladd0
+ %freeze1 = freeze double %fmuladd1
+ store double %freeze0, ptr @dst64, align 8
+ store double %freeze1, ptr getelementptr inbounds ([8 x double], ptr @dst64, i32 0, i64 1), align 8
+ ret void
+}
+
define void @fmuladd_4f64() #0 {
; SSE-LABEL: @fmuladd_4f64(
; SSE-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @srcA64, align 8
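For illustration, here is a minimal sketch of the kind of pattern this change unblocks. It is not part of the patch: the globals @x/@y and the function name are hypothetical, and whether SLP actually emits the vector form depends on the target's cost model.

; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -passes=slp-vectorizer -S

@x = global [4 x i32] zeroinitializer, align 16
@y = global [4 x i32] zeroinitializer, align 16

; Four consecutive scalar load/freeze/store lanes; before this commit the
; freeze instructions blocked SLP from vectorizing the chain.
define void @freeze_then_store() {
entry:
  %x0 = load i32, ptr @x, align 4
  %x1 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @x, i32 0, i64 1), align 4
  %x2 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @x, i32 0, i64 2), align 4
  %x3 = load i32, ptr getelementptr inbounds ([4 x i32], ptr @x, i32 0, i64 3), align 4
  %f0 = freeze i32 %x0
  %f1 = freeze i32 %x1
  %f2 = freeze i32 %x2
  %f3 = freeze i32 %x3
  store i32 %f0, ptr @y, align 4
  store i32 %f1, ptr getelementptr inbounds ([4 x i32], ptr @y, i32 0, i64 1), align 4
  store i32 %f2, ptr getelementptr inbounds ([4 x i32], ptr @y, i32 0, i64 2), align 4
  store i32 %f3, ptr getelementptr inbounds ([4 x i32], ptr @y, i32 0, i64 3), align 4
  ret void
}

; With freeze treated as a vectorizable opcode, SLP can emit roughly:
;   %0 = load <4 x i32>, ptr @x, align 4
;   %1 = freeze <4 x i32> %0
;   store <4 x i32> %1, ptr @y, align 4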
@@ -9790,10 +9792,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     case Instruction::AShr:
     case Instruction::And:
     case Instruction::Or:
-    case Instruction::Xor: {
+    case Instruction::Xor:
+    case Instruction::Freeze: {
I rather doubt it should be here. I think the cost of freeze must be 0 always
Thanks for the feedback. You are correct. I will get that fixed.
Force-pushed from bc22572 to 0ca3ab6.
case Instruction::Freeze: {
  return CommonCost;
}
Drop braces
LG
@alexey-bataev I currently do not have committer access. Can you please land the pull request for me? Would you like me to squash + push first?
Allow SLP optimization to progress in the presence of freeze instructions. Prior
to this commit, freeze instructions blocked SLP optimization.
The following URL shows correctness of the addsub_freeze test:
https://alive2.llvm.org/ce/z/qm38oh