-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[LV] Check isPredInst instead of isScalarWithPred in uniform analysis. #98892
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesAny instruction marked as uniform will result in a uniform VPReplicateRecipe. If it requires predication, it will be placed in a replicate region, even if isScalarWithPredication returns false. Check isPredicatedInst instead of isScalarWithPredication to avoid generating uniform VPReplicateRecipes placed inside a replicate region. This fixes an assertion when using scalable VFs. Full diff: https://github.com/llvm/llvm-project/pull/98892.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 5520baef7152d..296a346fa7ae1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3907,7 +3907,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
SetVector<Instruction *> Worklist;
// Add uniform instructions demanding lane 0 to the worklist. Instructions
- // that are scalar with predication must not be considered uniform after
+ // that are require predication must not be considered uniform after
// vectorization, because that would create an erroneous replicating region
// where only a single instance out of VF should be formed.
// TODO: optimize such seldom cases if found important, see PR40816.
@@ -3917,9 +3917,10 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
<< *I << "\n");
return;
}
- if (isScalarWithPredication(I, VF)) {
- LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
- << *I << "\n");
+ if (isPredicatedInst(I)) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Found not uniform due to requiring predication: " << *I
+ << "\n");
return;
}
LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
new file mode 100644
index 0000000000000..6fd74d1416873
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/divs-with-scalable-vfs.ll
@@ -0,0 +1,92 @@
+; RUN: opt -p loop-vectorize -mtriple aarch64 -mcpu=neoverse-v1 -S %s | FileCheck %s
+
+; Test case for https://github.com/llvm/llvm-project/issues/94328.
+define void @sdiv_feeding_gep(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
+entry:
+ %conv61 = zext i32 %x to i64
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %div18 = sdiv i64 %M, %conv6
+ %conv20 = trunc i64 %div18 to i32
+ %mul30 = mul i64 %div18, %conv61
+ %sub31 = sub i64 %iv, %mul30
+ %conv34 = trunc i64 %sub31 to i32
+ %mul35 = mul i32 %x, %conv20
+ %add36 = add i32 %mul35, %conv34
+ %idxprom = sext i32 %add36 to i64
+ %gep = getelementptr double, ptr %dst, i64 %idxprom
+ store double 0.000000e+00, ptr %gep, align 8
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %N
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+define void @sdiv_feeding_gep_predicated(ptr %dst, i32 %x, i64 %M, i64 %conv6, i64 %N) {
+entry:
+ %conv61 = zext i32 %x to i64
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+ %c = icmp ule i64 %iv, %M
+ br i1 %c, label %then, label %loop.latch
+
+then:
+ %div18 = sdiv i64 %M, %conv6
+ %conv20 = trunc i64 %div18 to i32
+ %mul30 = mul i64 %div18, %conv61
+ %sub31 = sub i64 %iv, %mul30
+ %conv34 = trunc i64 %sub31 to i32
+ %mul35 = mul i32 %x, %conv20
+ %add36 = add i32 %mul35, %conv34
+ %idxprom = sext i32 %add36 to i64
+ %gep = getelementptr double, ptr %dst, i64 %idxprom
+ store double 0.000000e+00, ptr %gep, align 8
+ br label %loop.latch
+
+loop.latch:
+ %iv.next = add i64 %iv, 1
+ %ec = icmp eq i64 %iv.next, %N
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret void
+}
+
+; Test case for https://github.com/llvm/llvm-project/issues/80416.
+define void @udiv_urem_feeding_gep(i64 %x, ptr %dst, i64 %N) {
+entry:
+ %mul.1.i = mul i64 %x, %x
+ %mul.2.i = mul i64 %mul.1.i, %x
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %div.i = udiv i64 %iv, %mul.2.i
+ %rem.i = urem i64 %iv, %mul.2.i
+ %div.1.i = udiv i64 %rem.i, %mul.1.i
+ %rem.1.i = urem i64 %rem.i, %mul.1.i
+ %div.2.i = udiv i64 %rem.1.i, %x
+ %rem.2.i = urem i64 %rem.1.i, %x
+ %mul.i = mul i64 %x, %div.i
+ %add.i = add i64 %mul.i, %div.1.i
+ %mul.1.i9 = mul i64 %add.i, %x
+ %add.1.i = add i64 %mul.1.i9, %div.2.i
+ %mul.2.i11 = mul i64 %add.1.i, %x
+ %add.2.i = add i64 %mul.2.i11, %rem.2.i
+ %sext.i = shl i64 %add.2.i, 32
+ %conv6.i = ashr i64 %sext.i, 32
+ %gep = getelementptr i64, ptr %dst, i64 %conv6.i
+ store i64 %div.i, ptr %gep, align 4
+ %iv.next = add i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv, %N
+ br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+ ret void
+}
|
369b5ad
to
b19facf
Compare
✅ With the latest revision this PR passed the C/C++ code formatter. |
Any instruction marked as uniform will result in a uniform VPReplicateRecipe. If it requires predication, it will be placed in a replicate region, even if isScalarWithPredication returns false. Check isPredicatedInst instead of isScalarWithPredication to avoid generating uniform VPReplicateRecipes placed inside a replicate region. This fixes an assertion when using scalable VFs. Fixes llvm#80416. Fixes llvm#94328.
b19facf
to
b6ab904
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LG with a nit
@@ -3907,7 +3907,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { | |||
SetVector<Instruction *> Worklist; | |||
|
|||
// Add uniform instructions demanding lane 0 to the worklist. Instructions | |||
// that are scalar with predication must not be considered uniform after | |||
// that are require predication must not be considered uniform after |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
// that are require predication must not be considered uniform after | |
// that require predication must not be considered uniform after |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, fixed!
When folding the tail, at least one of the lanes must execute unconditionally. If the divisor is loop-invariant no predication is needed, as predication would not prevent the divide-by-0 on the executed lane. Depends on llvm#98892.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this suggest that isScalarWithPredication misses some candidates, namely isPredicatedInst()'s considered - and now turned down - by addToWorklistIfAllowed()? There may be a phase ordering issue in how scalars, uniforms, and predicated instructions are classified.
Adding some thoughts inline.
@@ -9482,6 +9483,8 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { | |||
void VPReplicateRecipe::execute(VPTransformState &State) { | |||
Instruction *UI = getUnderlyingInstr(); | |||
if (State.Instance) { // Generate a single instance. | |||
assert((State.VF.isScalar() || !isUniform()) && | |||
"uniform recipe shouldn't be predicated"); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could this assert be committed separately?
Better catch this earlier, say assert (!IsUniform || !IsPredicated) upon replicate recipe construction and/or at handleReplication()?
Note the if (IsUniform)
below - would be good to be consistent.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Added in 3717776 in handleReplication. There is some additional code there that converts predicated assumes to uniform for scalable vectors which may should be revisited. (Predicated assumes are later removed by a VPlan-to-VPlan transform)
@@ -3907,7 +3907,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { | |||
SetVector<Instruction *> Worklist; | |||
|
|||
// Add uniform instructions demanding lane 0 to the worklist. Instructions | |||
// that are scalar with predication must not be considered uniform after | |||
// that are require predication must not be considered uniform after | |||
// vectorization, because that would create an erroneous replicating region | |||
// where only a single instance out of VF should be formed. | |||
// TODO: optimize such seldom cases if found important, see PR40816. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do the github issues offer additional seldom cases of lost uniformity potentially worth optimizing if important, to be added to TODO?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
After reading through it (On GH #40162), I don't think so. AFAICT there is some potential improvements by creating interleave groups (or at least their runtime guards) later. Remove the TODO here
@@ -0,0 +1,391 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Worth pre-committing these tests to first record their current xfail behavior?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done as 270f5e4
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: opt -p loop-vectorize -mtriple aarch64 -mcpu=neoverse-v1 -S %s | FileCheck %s | ||
|
||
; Test case for https://github.com/llvm/llvm-project/issues/94328. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wonder if a smaller reproducer could be devised?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unfortunately I wasn't able to
Add assertion ensuring invariant on construction, split off as suggested from #98892.
Summary: Add assertion ensuring invariant on construction, split off as suggested from #98892. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60250783
Summary: Currently the tests crash, due to a VPReplicateRecipe getting predicated for scalable vectors. Precommits tests for #98892. Test cases for * #80416 and * #94328 Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251705
#98892) Summary: Any instruction marked as uniform will result in a uniform VPReplicateRecipe. If it requires predication, it will be placed in a replicate region, even if isScalarWithPredication returns false. Check isPredicatedInst instead of isScalarWithPredication to avoid generating uniform VPReplicateRecipes placed inside a replicate region. This fixes an assertion when using scalable VFs. Fixes #80416. Fixes #94328. Fixes #99625. PR: #98892 Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251530
…98904) Summary: When folding the tail, at least one of the lanes must execute unconditionally. If the divisor is loop-invariant no predication is needed, as predication would not prevent the divide-by-0 on the executed lane. Depends on #98892. PR: #98904 Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60250648
Any instruction marked as uniform will result in a uniform VPReplicateRecipe. If it requires predication, it will be placed in a replicate region, even if isScalarWithPredication returns false.
Check isPredicatedInst instead of isScalarWithPredication to avoid generating uniform VPReplicateRecipes placed inside a replicate region. This fixes an assertion when using scalable VFs.
Fixes #80416.
Fixes #94328.
Fixes #99625.