-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[LV] Amend check for IV increments in collectUsersInEntryBlock #108020
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-transforms Author: Hari Limaye (hazzlim) Changes…lock The check for IV increments in collectUsersInEntryBlock currently triggers for exit-block PHIs which use the IV start value, resulting in us failing to add the input value for the middle block to these PHIs. Fix this by amending the check for IV increments to only include incoming values that are instructions inside the loop. Fixes #108004 Full diff: https://github.com/llvm/llvm-project/pull/108020.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2be3b577529258..054a170c36863c 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8662,6 +8662,7 @@ static MapVector<PHINode *, VPValue *> collectUsersInExitBlock(
!cast<VPWidenIntOrFpInductionRecipe>(V)->getTruncInst()) ||
isa<VPWidenPointerInductionRecipe>(V) ||
(isa<Instruction>(IncomingValue) &&
+ OrigLoop->contains(cast<Instruction>(IncomingValue)) &&
any_of(IncomingValue->users(), [&Inductions](User *U) {
auto *P = dyn_cast<PHINode>(U);
return P && Inductions.contains(P);
diff --git a/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll
new file mode 100644
index 00000000000000..c7ea86533570b0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -S | FileCheck %s
+
+; Check that we correctly handle the use of %start2 in the exit block, and do
+; not crash.
+
+define i64 @foo(i64 %start, i64 %end) {
+; CHECK-LABEL: define i64 @foo(
+; CHECK-SAME: i64 [[START:%.*]], i64 [[END:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i32], align 4
+; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: [[START2:%.*]] = and i64 [[START]], 12345
+; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END]], [[START2]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = add i64 [[START2]], [[N_VEC]]
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[START2]], [[INDEX]]
+; CHECK-NEXT: [[IND:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[IND]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[IND]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START2]], %[[ENTRY]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[IND1:%.*]] = phi i64 [ [[IND_NEXT1:%.*]], %[[FOR_BODY]] ], [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[IND1]]
+; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[IND1]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[IND_NEXT1]] = add i64 [[IND1]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ne i64 [[IND_NEXT1]], [[END]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: [[USE:%.*]] = phi i64 [ [[START2]], %[[FOR_BODY]] ], [ [[START2]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: ret i64 [[USE]]
+;
+entry:
+ %p1 = alloca [1024 x i32]
+ %p2 = alloca [1024 x i32]
+ call void @init_mem(ptr %p1, i64 1024)
+ call void @init_mem(ptr %p2, i64 1024)
+ %start2 = and i64 %start, 12345
+ br label %for.body
+
+for.body:
+ %ind = phi i64 [ %ind.next, %for.body ], [ %start2, %entry ]
+ %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %ind
+ %0 = load i32, ptr %arrayidx1, align 4
+ %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %ind
+ %1 = load i32, ptr %arrayidx2, align 4
+ %ind.next = add i64 %ind, 1
+ %cmp = icmp ne i64 %ind.next, %end
+ br i1 %cmp, label %for.body, label %exit
+
+exit:
+ %use = phi i64 [ %start2, %for.body ]
+ ret i64 %use
+}
+
+declare void @init_mem(ptr, i64)
+
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
|
; CHECK-NEXT: ret i64 [[USE]] | ||
; | ||
entry: | ||
%p1 = alloca [1024 x i32] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can p1
/p2
instead be passed as function arguments, removing the need to call @init_mem
?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point - I've updated the test to pass p1 and p2 as function arguments.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks for the fix!
br label %for.body | ||
|
||
for.body: | ||
%ind = phi i64 [ %ind.next, %for.body ], [ %start2, %entry ] |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
%ind = phi i64 [ %ind.next, %for.body ], [ %start2, %entry ] | |
%ind = phi i64 [ %start2, %entry ], [ %ind.next, %for.body ] |
nit: seems more natural to have the incoming entry for %entry
first
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done :)
(looks like the title line of the PR overflowed to the description. Might be good to shorten the title, e.g. |
The check for IV increments in collectUsersInEntryBlock currently triggers for exit-block PHIs which use the IV start value, resulting in us failing to add the input value for the middle block to these PHIs. Fix this by amending the check for IV increments to only include incoming values that are instructions inside the loop.
78656e4
to
65774f5
Compare
Nice - done. Thank you for reviewing - I will land this once checks have passed. |
The check for IV increments in collectUsersInEntryBlock currently triggers for exit-block PHIs which use the IV start value, resulting in us failing to add the input value for the middle block to these PHIs.
Fix this by amending the check for IV increments to only include incoming values that are instructions inside the loop.
Fixes #108004