[SelectOpt] Support ADD and SUB with zext operands. #115489
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Florian Hahn (fhahn)

Changes

Extend the support for implicit selects in the form of OR with a ZExt operand to also cover the ADD and SUB binops. They can similarly form implicit selects, which can be profitable to convert back to branches.

Full diff: https://github.com/llvm/llvm-project/pull/115489.diff

3 Files Affected:
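For illustration, a rough LLVM IR sketch of the equivalence being described (value names here are made up, not taken from the patch or its tests):

```llvm
; Minimal sketch: with %c an i1 condition,
;   add %y, (zext %c)  behaves as  select %c, (%y + 1), %y
;   sub %y, (zext %c)  behaves as  select %c, (%y - 1), %y
; analogous to the already-handled  or %y, (zext %c)  ==  select %c, (%y | 1), %y.
define i64 @add_as_select(i64 %y, i1 %c) {
  %c.ext = zext i1 %c to i64
  %r = add i64 %y, %c.ext   ; implicit select between %y + 1 (true) and %y (false)
  ret i64 %r
}
```

SelectOptimize can then turn such a binop into an explicit conditional branch when the cost model says that is profitable, as the test diffs below show.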
diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp
index 9587534d1170fc..ce3c90d43391d9 100644
--- a/llvm/lib/CodeGen/SelectOptimize.cpp
+++ b/llvm/lib/CodeGen/SelectOptimize.cpp
@@ -145,11 +145,20 @@ class SelectOptimizeImpl {
// An Or(zext(i1 X), Y) can also be treated like a select, with condition
// C and values Y|1 and Y.
- Value *X;
- if (PatternMatch::match(
- I, m_c_Or(m_OneUse(m_ZExt(m_Value(X))), m_Value())) &&
- X->getType()->isIntegerTy(1))
- return SelectLike(I);
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Or:
+ case Instruction::Sub: {
+ Value *X;
+ if ((PatternMatch::match(I->getOperand(0),
+ m_OneUse(m_ZExt(m_Value(X)))) ||
+ PatternMatch::match(I->getOperand(1),
+ m_OneUse(m_ZExt(m_Value(X))))) &&
+ X->getType()->isIntegerTy(1))
+ return SelectLike(I);
+ break;
+ }
+ }
return SelectLike(nullptr);
}
@@ -250,19 +259,22 @@ class SelectOptimizeImpl {
: Scaled64::getZero();
}
- // Or case - add the cost of an extra Or to the cost of the False case.
- if (isa<BinaryOperator>(I))
- if (auto I = dyn_cast<Instruction>(getFalseValue())) {
+ // BinaryOp case - add the cost of an extra BinOp to the cost of the False
+ // case.
+ if (isa<BinaryOperator>(I)) {
+ if (auto OpI = dyn_cast<Instruction>(getFalseValue())) {
auto It = InstCostMap.find(I);
if (It != InstCostMap.end()) {
InstructionCost OrCost = TTI->getArithmeticInstrCost(
- Instruction::Or, I->getType(), TargetTransformInfo::TCK_Latency,
+ I->getOpcode(), OpI->getType(),
+ TargetTransformInfo::TCK_Latency,
{TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OP_None},
{TTI::OK_UniformConstantValue, TTI::OP_PowerOf2});
return It->second.NonPredCost + Scaled64::get(*OrCost.getValue());
}
}
+ }
return Scaled64::getZero();
}
@@ -548,12 +560,16 @@ getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI, bool isTrue,
V = (!isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue());
}
- if (isa<BinaryOperator>(SI.getI())) {
- assert(SI.getI()->getOpcode() == Instruction::Or &&
- "Only currently handling Or instructions.");
+ if (auto *BinOp = dyn_cast<BinaryOperator>(SI.getI())) {
+ assert((BinOp->getOpcode() == Instruction::Add ||
+ BinOp->getOpcode() == Instruction::Or ||
+ BinOp->getOpcode() == Instruction::Sub) &&
+ "Only currently handling Add, Or and Sub instructions.");
V = SI.getFalseValue();
- if (isTrue)
- V = IB.CreateOr(V, ConstantInt::get(V->getType(), 1));
+ if (isTrue) {
+ Constant *CI = ConstantInt::get(V->getType(), 1);
+ V = IB.CreateBinOp(BinOp->getOpcode(), V, CI);
+ }
}
assert(V && "Failed to get select true/false value");
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 71f9bbbbc35041..75eea291896ef7 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4678,14 +4678,20 @@ AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
}
bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
- // For the binary operators (e.g. or) we need to be more careful than
- // selects, here we only transform them if they are already at a natural
- // break point in the code - the end of a block with an unconditional
- // terminator.
- if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
- isa<BranchInst>(I->getNextNode()) &&
- cast<BranchInst>(I->getNextNode())->isUnconditional())
- return true;
+ if (EnableOrLikeSelectOpt) {
+ // For the binary operators (e.g. or) we need to be more careful than
+ // selects, here we only transform them if they are already at a natural
+ // break point in the code - the end of a block with an unconditional
+ // terminator.
+ if (I->getOpcode() == Instruction::Or &&
+ isa<BranchInst>(I->getNextNode()) &&
+ cast<BranchInst>(I->getNextNode())->isUnconditional())
+ return true;
+
+ if (I->getOpcode() == Instruction::Add ||
+ I->getOpcode() == Instruction::Sub)
+ return true;
+ }
return BaseT::shouldTreatInstructionLikeSelect(I);
}
diff --git a/llvm/test/CodeGen/AArch64/selectopt-cast.ll b/llvm/test/CodeGen/AArch64/selectopt-cast.ll
index b1804a02ec1c82..3dfb72b299c2ea 100644
--- a/llvm/test/CodeGen/AArch64/selectopt-cast.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt-cast.ll
@@ -7,16 +7,22 @@ define void @test_add_zext(ptr %dst, ptr %src, i64 %j.start, i64 %p, i64 %i.star
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[HIGH:%.*]], [[ENTRY]] ], [ [[J_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_START:%.*]], [[ENTRY]] ], [ [[J_NEXT]], [[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[SELECT_END:%.*]] ]
+; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[HIGH:%.*]], [[ENTRY]] ], [ [[J_NEXT:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_START:%.*]], [[ENTRY]] ], [ [[J_NEXT]], [[SELECT_END]] ]
; CHECK-NEXT: [[GEP_I:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
; CHECK-NEXT: [[L_I:%.*]] = load ptr, ptr [[GEP_I]], align 8
; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
; CHECK-NEXT: [[L_J:%.*]] = load ptr, ptr [[GEP_J]], align 8
; CHECK-NEXT: [[CMP3:%.*]] = icmp ult ptr [[L_I]], [[L_J]]
; CHECK-NEXT: [[DEC:%.*]] = zext i1 [[CMP3]] to i64
-; CHECK-NEXT: [[J_NEXT]] = add nsw i64 [[J]], [[DEC]]
+; CHECK-NEXT: [[CMP3_FROZEN:%.*]] = freeze i1 [[CMP3]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[J]], 1
+; CHECK-NEXT: br i1 [[CMP3_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[J_NEXT]] = phi i64 [ [[TMP0]], [[LOOP]] ], [ [[J]], [[SELECT_FALSE]] ]
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[IV]]
; CHECK-NEXT: store i64 [[J_NEXT]], ptr [[GEP_DST]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
@@ -54,9 +60,9 @@ define void @test_add_zext_not(ptr %dst, ptr %src, i64 %j.start, i64 %p, i64 %i.
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[HIGH:%.*]], [[ENTRY]] ], [ [[J_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_START:%.*]], [[ENTRY]] ], [ [[J_NEXT]], [[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[SELECT_END:%.*]] ]
+; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[HIGH:%.*]], [[ENTRY]] ], [ [[J_NEXT:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_START:%.*]], [[ENTRY]] ], [ [[J_NEXT]], [[SELECT_END]] ]
; CHECK-NEXT: [[GEP_I:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
; CHECK-NEXT: [[L_I:%.*]] = load ptr, ptr [[GEP_I]], align 8
; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
@@ -64,7 +70,13 @@ define void @test_add_zext_not(ptr %dst, ptr %src, i64 %j.start, i64 %p, i64 %i.
; CHECK-NEXT: [[CMP3:%.*]] = icmp ult ptr [[L_I]], [[L_J]]
; CHECK-NEXT: [[NOT_CMP3:%.*]] = xor i1 [[CMP3]], true
; CHECK-NEXT: [[DEC:%.*]] = zext i1 [[NOT_CMP3]] to i64
-; CHECK-NEXT: [[J_NEXT]] = add nsw i64 [[J]], [[DEC]]
+; CHECK-NEXT: [[NOT_CMP3_FROZEN:%.*]] = freeze i1 [[NOT_CMP3]]
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[J]], 1
+; CHECK-NEXT: br i1 [[NOT_CMP3_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[J_NEXT]] = phi i64 [ [[TMP0]], [[LOOP]] ], [ [[J]], [[SELECT_FALSE]] ]
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[IV]]
; CHECK-NEXT: store i64 [[J_NEXT]], ptr [[GEP_DST]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
@@ -308,16 +320,22 @@ define void @test_sub_zext(ptr %dst, ptr %src, i64 %j.start, i64 %p, i64 %i.star
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[HIGH:%.*]], [[ENTRY]] ], [ [[J_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_START:%.*]], [[ENTRY]] ], [ [[J_NEXT]], [[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[SELECT_END:%.*]] ]
+; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[HIGH:%.*]], [[ENTRY]] ], [ [[J_NEXT:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_START:%.*]], [[ENTRY]] ], [ [[J_NEXT]], [[SELECT_END]] ]
; CHECK-NEXT: [[GEP_I:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
; CHECK-NEXT: [[L_I:%.*]] = load ptr, ptr [[GEP_I]], align 8
; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
; CHECK-NEXT: [[L_J:%.*]] = load ptr, ptr [[GEP_J]], align 8
; CHECK-NEXT: [[CMP3:%.*]] = icmp ult ptr [[L_I]], [[L_J]]
; CHECK-NEXT: [[DEC:%.*]] = zext i1 [[CMP3]] to i64
-; CHECK-NEXT: [[J_NEXT]] = sub nsw i64 [[J]], [[DEC]]
+; CHECK-NEXT: [[CMP3_FROZEN:%.*]] = freeze i1 [[CMP3]]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[J]], 1
+; CHECK-NEXT: br i1 [[CMP3_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[J_NEXT]] = phi i64 [ [[TMP0]], [[LOOP]] ], [ [[J]], [[SELECT_FALSE]] ]
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[IV]]
; CHECK-NEXT: store i64 [[J_NEXT]], ptr [[GEP_DST]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
@@ -355,9 +373,9 @@ define void @test_sub_zext_not(ptr %dst, ptr %src, i64 %j.start, i64 %p, i64 %i.
; CHECK-NEXT: entry:
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[HIGH:%.*]], [[ENTRY]] ], [ [[J_NEXT:%.*]], [[LOOP]] ]
-; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_START:%.*]], [[ENTRY]] ], [ [[J_NEXT]], [[LOOP]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[SELECT_END:%.*]] ]
+; CHECK-NEXT: [[J:%.*]] = phi i64 [ [[HIGH:%.*]], [[ENTRY]] ], [ [[J_NEXT:%.*]], [[SELECT_END]] ]
+; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_START:%.*]], [[ENTRY]] ], [ [[J_NEXT]], [[SELECT_END]] ]
; CHECK-NEXT: [[GEP_I:%.*]] = getelementptr inbounds ptr, ptr [[SRC:%.*]], i64 [[I]]
; CHECK-NEXT: [[L_I:%.*]] = load ptr, ptr [[GEP_I]], align 8
; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr inbounds ptr, ptr [[SRC]], i64 [[J]]
@@ -365,7 +383,13 @@ define void @test_sub_zext_not(ptr %dst, ptr %src, i64 %j.start, i64 %p, i64 %i.
; CHECK-NEXT: [[CMP3:%.*]] = icmp ult ptr [[L_I]], [[L_J]]
; CHECK-NEXT: [[NOT_CMP3:%.*]] = xor i1 [[CMP3]], true
; CHECK-NEXT: [[DEC:%.*]] = zext i1 [[NOT_CMP3]] to i64
-; CHECK-NEXT: [[J_NEXT]] = sub nsw i64 [[J]], [[DEC]]
+; CHECK-NEXT: [[NOT_CMP3_FROZEN:%.*]] = freeze i1 [[NOT_CMP3]]
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[J]], 1
+; CHECK-NEXT: br i1 [[NOT_CMP3_FROZEN]], label [[SELECT_END]], label [[SELECT_FALSE:%.*]]
+; CHECK: select.false:
+; CHECK-NEXT: br label [[SELECT_END]]
+; CHECK: select.end:
+; CHECK-NEXT: [[J_NEXT]] = phi i64 [ [[TMP0]], [[LOOP]] ], [ [[J]], [[SELECT_FALSE]] ]
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds ptr, ptr [[DST:%.*]], i64 [[IV]]
; CHECK-NEXT: store i64 [[J_NEXT]], ptr [[GEP_DST]], align 8
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
Good idea. LGTM.
This only enables it for AArch64. @sapostolakis not sure if you would like to look into enabling this for other architectures (I think the OR support is only enabled for AArch64 as well).
Do you have performance results for this change? In the past I tried to keep this relatively conservative, as depending on the predictability of the branch it can make things better or worse (we found a case where it was worse recently). We have been looking at some other selects that could benefit from the transform, but I had expected they would only be generally profitable once considered as part of a larger select group. My understanding was that the Apple CPUs did not use this option either. Do you intend to change that?
Oh and theoretically a sub is not commutative like an add or or, so we would need the zext to be the second operand.
I am currently working on enabling it for Apple CPUs, and for some motivating cases we need support for add and sub here. Overall, enabling it with this change is neutral on SPEC and some other proprietary benchmarks. I agree that there is the risk that if we get it wrong the result can be pretty bad, but I don't think the add/sub cases are fundamentally different from the or case in that regard. I expect that as we support more cases, we are likely to hit additional cases where the cost model gets things wrong. What do you think is the best way to make progress here?
That's good to hear. I was under the impression (without knowing any of the details) that they were at least less susceptible to the issues we see. Maybe they are present everywhere to some extent.
Yeah the add and or are very similar. We ruled out logical and/or in ca78b56, and recently prevented it from folding selects with two constant operands in 1789534. I also tried to make the or relatively constrained when I added it, only adding them where a branch already existed (a2d68b4). I am a bit suspicious of things that look like logic or arithmetic, as they are sometimes quicker as logic/arithmetic. The problem with the cost model is that it doesn't know how predictable the branch is, and so doesn't have full info available. My understanding is that well-predictable loops can benefit greatly from converting the selects, but in larger codebases that are already heavy on the number of branches it can introduce extra overhead.
Honestly, my hope was that they would fix it in the hardware :) I ran this and it didn't look too bad, but I didn't look into any specific case deeply. Do you have tests for commuted sub operands, where the first operand is the condition?
As an example of something I thought of with adds: an i128 addition using i64 values has a zext and add that we would usually hope to transform into an adc. https://godbolt.org/z/nY1xfzE5M. That will not transform due to the cost model, and would need more going on to transform, but it is the kind of thing we might need to worry about.
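For context, a rough IR sketch of the kind of pattern the godbolt link points at (names and exact shape are assumptions, not taken from the link): the carry of the low 64-bit addition shows up as zext(icmp) feeding an add, which backends normally want to lower to adds/adc rather than a branch.

```llvm
; Hypothetical i128-as-two-i64 addition: the carry is materialised as
; zext(icmp) and added into the high half. The final add-of-zext looks
; "select-like" to SelectOptimize, but turning it into a branch would
; defeat the usual adds + adc lowering.
define { i64, i64 } @add_i128_parts(i64 %a.lo, i64 %a.hi, i64 %b.lo, i64 %b.hi) {
  %lo = add i64 %a.lo, %b.lo
  %carry = icmp ult i64 %lo, %a.lo          ; carry out of the low addition
  %carry.ext = zext i1 %carry to i64
  %hi.nocarry = add i64 %a.hi, %b.hi
  %hi = add i64 %hi.nocarry, %carry.ext
  %r0 = insertvalue { i64, i64 } poison, i64 %lo, 0
  %r1 = insertvalue { i64, i64 } %r0, i64 %hi, 1
  ret { i64, i64 } %r1
}
```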
I agree with @fhahn's point that the Add case is no different from the Or case, and if there is a regression, the cost modeling would probably need to be adjusted. @davemgreen you are right that the predictability of the branch is a missing aspect here that could lead to incorrect cost modeling. The cost model will skip converting to branches if the select is annotated as unpredictable (either from user annotations or populated by other passes; e.g., recently there was a discussion of feeding back perf data for mispredictions), but there is no similar heuristic for highly predictable branches and, if I am not mistaken, no equivalent annotation (maybe we could add a predictable metadata and populate it based on user hints such as ABSL_PREDICT_TRUE or SPGO mispredict data). The default misprediction rate in the cost model is meant to be conservative and assumes some level of misprediction, given that converting to a highly mispredicting branch is a worse outcome than keeping a predictable select.
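For reference, a minimal sketch of the existing annotation mentioned above (the function and values are hypothetical): a select carrying !unpredictable metadata is left alone, but there is currently no opposite hint marking a select as highly predictable.

```llvm
; Hypothetical example: the !unpredictable annotation tells the pass not to
; turn this select into a branch; no equivalent "predictable" hint exists
; today, as discussed above.
define i64 @annotated_select(i1 %c, i64 %a, i64 %b) {
  %r = select i1 %c, i64 %a, i64 %b, !unpredictable !0
  ret i64 %r
}

!0 = !{}
```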
Hi - Yeah, the heuristic is trying to do the best it can with the information it has; it just won't always be perfect. @igogo-x86 mentioned that this was something he was planning to add anyway, so it sounds OK as it will likely be added either here or in another patch. If it was me I might make it more conservative (maybe only do it if it was part of a larger select group), but I don't know of any cases where this will directly cause problems. The example above with i128 addition doesn't trigger on its own. The sub issue still remains, as I believe subs will be treated like a commutative operation, so we would need to make sure it is the second operand that is the zext/sext.
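To spell out the commuted-sub concern with a small sketch (hypothetical values): only a sub whose second operand is the zext forms an implicit select on %y; with the operands swapped the two outcomes are 1 - %y and -%y, which is not the Y-versus-Y-op-1 shape the pass looks for.

```llvm
; Hypothetical illustration of the operand-order issue for sub.
define i64 @sub_operand_order(i64 %y, i1 %c) {
  %z1 = zext i1 %c to i64
  %ok = sub i64 %y, %z1   ; == select %c, (%y - 1), %y       -> select-like
  %z2 = zext i1 %c to i64
  %no = sub i64 %z2, %y   ; == select %c, (1 - %y), (0 - %y) -> not select-like
  %r = add i64 %ok, %no
  ret i64 %r
}
```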
Extra tests for #115489 with different operand order. Also fixes the target triple.
Extend the support for implicit selects in the form of OR with a ZExt operand to support ADD and SUB binops as well. They similarly can form implicit selects which can be profitable to convert back the branches.
Thanks, updated the patch again after @igogo-x86's latest patch landed, and I think the operands are now added in the correct order (e.g. as in …).
Building on top of llvm#115489 extend support for binops with SExt operand.
Heads up: we see a number of test failures on aarch64 after this commit. Might be a miscompilation. A reduced test case is in the works.
Thanks for the fix! It resolves the issues we're seeing.
Extra tests for llvm#115489 with different operand order. Also fixes the target triple. (cherry picked from commit f8f238d)
Extend the support for implicit selects in the form of OR with a ZExt operand to support ADD and SUB binops as well. They similarly can form implicit selects which can be profitable to convert back the branches. PR: llvm#115489 (cherry picked from commit 9a0f251)
Building on top of llvm#115489 extend support for binops with SExt operand. PR: llvm#115879 (cherry picked from commit c1f5937)