Skip to content

Commit d3bc834

Browse files
committed
[LV] Update check to find epilogue resume value to check all incoming.
This fixes a crash where all incoming values for the epilogue resume value are zero, because there are no remaining iterations to execute for the epilogue loop.
1 parent 1e60dd4 commit d3bc834

File tree

2 files changed

+150
-1
lines changed

2 files changed

+150
-1
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9765,7 +9765,10 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
97659765
match(
97669766
P.getIncomingValueForBlock(EPI.MainLoopIterationCountCheck),
97679767
m_SpecificInt(0)) &&
9768-
is_contained(P.incoming_values(), EPI.VectorTripCount))
9768+
all_of(P.incoming_values(), [&EPI](Value *Inc) {
9769+
return Inc == EPI.VectorTripCount ||
9770+
match(Inc, m_SpecificInt(0));
9771+
}))
97699772
return &P;
97709773
return nullptr;
97719774
});
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
2+
; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s
3+
4+
target triple = "aarch64-linux-gnu"
5+
6+
define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 {
7+
; CHECK-LABEL: define i64 @main_vector_loop_fixed_with_no_remaining_iterations(
8+
; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] {
9+
; CHECK-NEXT: [[ITER_CHECK:.*]]:
10+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
11+
; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2
12+
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP3]]
13+
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
14+
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
15+
; CHECK-NEXT: br i1 true, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
16+
; CHECK: [[VECTOR_PH]]:
17+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
18+
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
19+
; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
20+
; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
21+
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
22+
; CHECK: [[VECTOR_BODY]]:
23+
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
24+
; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ]
25+
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[INDEX]], i32 0, i64 3
26+
; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
27+
; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <64 x i8> [[WIDE_VEC2]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
28+
; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[STRIDED_VEC3]] to <16 x i32>
29+
; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP0]], <16 x i32> [[TMP6]])
30+
; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP8]])
31+
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
32+
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0
33+
; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP12]], align 1
34+
; CHECK-NEXT: [[TMP15:%.*]] = zext <16 x i32> [[TMP10]] to <16 x i64>
35+
; CHECK-NEXT: [[TMP17]] = or <16 x i64> [[VEC_PHI1]], [[TMP15]]
36+
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
37+
; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
38+
; CHECK: [[MIDDLE_BLOCK]]:
39+
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP17]])
40+
; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]]
41+
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
42+
; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
43+
; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2
44+
; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP14]]
45+
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
46+
; CHECK: [[VEC_EPILOG_PH]]:
47+
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
48+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
49+
; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64()
50+
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP31]], 2
51+
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP16]]
52+
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
53+
; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP32]], i64 [[TMP16]], i64 [[N_MOD_VF]]
54+
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 16, [[TMP36]]
55+
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
56+
; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
57+
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
58+
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
59+
; CHECK-NEXT: [[TMP21:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
60+
; CHECK-NEXT: [[TMP22:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
61+
; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
62+
; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
63+
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0
64+
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
65+
; CHECK-NEXT: [[TMP25:%.*]] = mul <vscale x 2 x i64> [[TMP24]], splat (i64 1)
66+
; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP25]]
67+
; CHECK-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP20]]
68+
; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP37]], i64 0
69+
; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT4]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
70+
; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
71+
; CHECK: [[VEC_EPILOG_VECTOR_BODY]]:
72+
; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
73+
; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 2 x i64> [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
74+
; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <vscale x 2 x i64> [ [[TMP21]], %[[VEC_EPILOG_PH]] ], [ [[TMP34:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ]
75+
; CHECK-NEXT: [[TMP38:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
76+
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP38]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
77+
; CHECK-NEXT: [[TMP28:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
78+
; CHECK-NEXT: [[TMP29:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP22]], <vscale x 2 x i32> [[TMP28]])
79+
; CHECK-NEXT: [[TMP39:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
80+
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX6]]
81+
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0
82+
; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP27]], align 1
83+
; CHECK-NEXT: [[TMP33:%.*]] = zext <vscale x 2 x i32> [[TMP39]] to <vscale x 2 x i64>
84+
; CHECK-NEXT: [[TMP34]] = or <vscale x 2 x i64> [[VEC_PHI6]], [[TMP33]]
85+
; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX6]], [[TMP20]]
86+
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 2 x i64> [[VEC_IND]], [[DOTSPLAT5]]
87+
; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC]]
88+
; CHECK-NEXT: br i1 [[TMP35]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
89+
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
90+
; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64(<vscale x 2 x i64> [[TMP34]])
91+
; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]]
92+
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
93+
; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
94+
; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i64 [ [[TMP30]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ]
95+
; CHECK-NEXT: br label %[[LOOP:.*]]
96+
; CHECK: [[LOOP]]:
97+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL8]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
98+
; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX9]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
99+
; CHECK-NEXT: [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3
100+
; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1
101+
; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32
102+
; CHECK-NEXT: [[ABS_0:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false)
103+
; CHECK-NEXT: [[MIN_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_0]], i32 [[L_EXT]])
104+
; CHECK-NEXT: [[ABS_1:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false)
105+
; CHECK-NEXT: [[MIN_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_1]], i32 [[MIN_0]])
106+
; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]]
107+
; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1
108+
; CHECK-NEXT: [[MIN_EXT:%.*]] = zext i32 [[MIN_1]] to i64
109+
; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]]
110+
; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
111+
; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 16
112+
; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
113+
; CHECK: [[EXIT]]:
114+
; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ]
115+
; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]]
116+
;
117+
entry:
118+
br label %loop
119+
120+
loop:
121+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
122+
%red = phi i64 [ 0, %entry ], [ %red.next, %loop ]
123+
%gep.src.i.i = getelementptr { [4 x i8] }, ptr %src, i64 %iv, i32 0, i64 3
124+
%l = load i8, ptr %gep.src.i.i, align 1
125+
%l.ext = zext i8 %l to i32
126+
%abs.0 = call i32 @llvm.abs.i32(i32 %x, i1 false)
127+
%min.0 = call i32 @llvm.umin.i32(i32 %abs.0, i32 %l.ext)
128+
%abs.1 = call i32 @llvm.abs.i32(i32 %x, i1 false)
129+
%min.1 = call i32 @llvm.umin.i32(i32 %abs.1, i32 %min.0)
130+
%gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv
131+
store i8 0, ptr %gep.dst, align 1
132+
%min.ext = zext i32 %min.1 to i64
133+
%red.next = or i64 %red, %min.ext
134+
%iv.next = add i64 %iv, 1
135+
%exitcond.not.i.i = icmp eq i64 %iv.next, 16
136+
br i1 %exitcond.not.i.i, label %exit, label %loop
137+
138+
exit:
139+
ret i64 %red.next
140+
}
141+
142+
declare i32 @llvm.umin.i32(i32, i32)
143+
144+
declare i32 @llvm.abs.i32(i32, i1 immarg)
145+
146+
attributes #0 = { "target-cpu"="neoverse-512tvb" }

0 commit comments

Comments
 (0)