@@ -48,22 +48,13 @@ entry:
48
48
vector.ph: ; preds = %entry
49
49
%n.rnd.up = add i32 %N , 3
50
50
%n.vec = and i32 %n.rnd.up , -4
51
- %trip.count.minus.1 = add i32 %N , -1
52
- %broadcast.splatinsert11 = insertelement <4 x i32 > undef , i32 %trip.count.minus.1 , i32 0
53
- %broadcast.splat12 = shufflevector <4 x i32 > %broadcast.splatinsert11 , <4 x i32 > undef , <4 x i32 > zeroinitializer
54
51
br label %vector.body
55
52
56
53
vector.body: ; preds = %vector.body, %vector.ph
57
54
%index = phi i32 [ 0 , %vector.ph ], [ %index.next , %vector.body ]
58
55
%vec.phi = phi <4 x i32 > [ zeroinitializer , %vector.ph ], [ %add , %vector.body ]
59
- %broadcast.splatinsert = insertelement <4 x i32 > undef , i32 %index , i32 0
60
- %broadcast.splat = shufflevector <4 x i32 > %broadcast.splatinsert , <4 x i32 > undef , <4 x i32 > zeroinitializer
61
- %induction = add <4 x i32 > %broadcast.splat , <i32 0 , i32 1 , i32 2 , i32 3 >
62
56
%tmp = getelementptr inbounds i32 , i32* %a , i32 %index
63
-
64
- ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
65
57
%tmp1 = call <4 x i1 > @llvm.get.active.lane.mask.v4i1.i32 (i32 %index , i32 %N )
66
-
67
58
%tmp2 = bitcast i32* %tmp to <4 x i32 >*
68
59
%wide.masked.load.a = call <4 x i32 > @llvm.masked.load.v4i32.p0v4i32 (<4 x i32 >* %tmp2 , i32 4 , <4 x i1 > %tmp1 , <4 x i32 > undef )
69
60
%tmp3 = getelementptr inbounds i32 , i32* %b , i32 %index
@@ -147,22 +138,13 @@ entry:
147
138
vector.ph: ; preds = %entry
148
139
%n.rnd.up = add i32 %N , 3
149
140
%n.vec = and i32 %n.rnd.up , -4
150
- %trip.count.minus.1 = add i32 %N , -1
151
- %broadcast.splatinsert11 = insertelement <4 x i32 > undef , i32 %trip.count.minus.1 , i32 0
152
- %broadcast.splat12 = shufflevector <4 x i32 > %broadcast.splatinsert11 , <4 x i32 > undef , <4 x i32 > zeroinitializer
153
141
br label %vector.body
154
142
155
143
vector.body: ; preds = %vector.body, %vector.ph
156
144
%index = phi i32 [ 0 , %vector.ph ], [ %index.next , %vector.body ]
157
145
%vec.phi = phi <4 x i32 > [ zeroinitializer , %vector.ph ], [ %add , %vector.body ]
158
- %broadcast.splatinsert = insertelement <4 x i32 > undef , i32 %index , i32 0
159
- %broadcast.splat = shufflevector <4 x i32 > %broadcast.splatinsert , <4 x i32 > undef , <4 x i32 > zeroinitializer
160
- %induction = add <4 x i32 > %broadcast.splat , <i32 0 , i32 1 , i32 2 , i32 3 >
161
146
%tmp = getelementptr inbounds i32 , i32* %a , i32 %index
162
-
163
- ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
164
147
%tmp1 = call <4 x i1 > @llvm.get.active.lane.mask.v4i1.i32 (i32 %index , i32 %N )
165
-
166
148
%tmp2 = bitcast i32* %tmp to <4 x i32 >*
167
149
%wide.masked.load.a = call <4 x i32 > @llvm.masked.load.v4i32.p0v4i32 (<4 x i32 >* %tmp2 , i32 4 , <4 x i1 > %tmp1 , <4 x i32 > undef )
168
150
%tmp3 = getelementptr inbounds i32 , i32* %b , i32 %index
@@ -205,13 +187,12 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
205
187
; CHECK-NEXT: cmp.w r12, #0
206
188
; CHECK-NEXT: beq .LBB2_4
207
189
; CHECK-NEXT: @ %bb.1: @ %vector.ph
208
- ; CHECK-NEXT: add.w r4, r12, #3
209
- ; CHECK-NEXT: vmov.i32 q1, #0x0
210
- ; CHECK-NEXT: bic r4, r4, #3
211
- ; CHECK-NEXT: sub.w lr, r4, #4
190
+ ; CHECK-NEXT: add.w lr, r12, #3
212
191
; CHECK-NEXT: movs r4, #1
192
+ ; CHECK-NEXT: bic lr, lr, #3
193
+ ; CHECK-NEXT: vmov.i32 q1, #0x0
194
+ ; CHECK-NEXT: sub.w lr, lr, #4
213
195
; CHECK-NEXT: add.w lr, r4, lr, lsr #2
214
- ; CHECK-NEXT: movs r4, #0
215
196
; CHECK-NEXT: dls lr, lr
216
197
; CHECK-NEXT: .LBB2_2: @ %vector.body
217
198
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
@@ -222,12 +203,11 @@ define dso_local i32 @and_mul_reduce_add(i32* noalias nocapture readonly %a, i32
222
203
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
223
204
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
224
205
; CHECK-NEXT: vsub.i32 q1, q2, q1
225
- ; CHECK-NEXT: adds r4 , #4
206
+ ; CHECK-NEXT: sub.w r12, r12 , #4
226
207
; CHECK-NEXT: vpsttt
227
208
; CHECK-NEXT: vcmpt.i32 eq, q1, zr
228
209
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
229
210
; CHECK-NEXT: vldrwt.u32 q2, [r2], #16
230
- ; CHECK-NEXT: sub.w r12, r12, #4
231
211
; CHECK-NEXT: vmul.i32 q1, q2, q1
232
212
; CHECK-NEXT: vadd.i32 q1, q1, q0
233
213
; CHECK-NEXT: le lr, .LBB2_2
@@ -249,22 +229,13 @@ entry:
249
229
vector.ph: ; preds = %entry
250
230
%n.rnd.up = add i32 %N , 3
251
231
%n.vec = and i32 %n.rnd.up , -4
252
- %trip.count.minus.1 = add i32 %N , -1
253
- %broadcast.splatinsert11 = insertelement <4 x i32 > undef , i32 %trip.count.minus.1 , i32 0
254
- %broadcast.splat12 = shufflevector <4 x i32 > %broadcast.splatinsert11 , <4 x i32 > undef , <4 x i32 > zeroinitializer
255
232
br label %vector.body
256
233
257
234
vector.body: ; preds = %vector.body, %vector.ph
258
235
%index = phi i32 [ 0 , %vector.ph ], [ %index.next , %vector.body ]
259
236
%vec.phi = phi <4 x i32 > [ zeroinitializer , %vector.ph ], [ %add , %vector.body ]
260
- %broadcast.splatinsert = insertelement <4 x i32 > undef , i32 %index , i32 0
261
- %broadcast.splat = shufflevector <4 x i32 > %broadcast.splatinsert , <4 x i32 > undef , <4 x i32 > zeroinitializer
262
- %induction = add <4 x i32 > %broadcast.splat , <i32 0 , i32 1 , i32 2 , i32 3 >
263
237
%tmp = getelementptr inbounds i32 , i32* %a , i32 %index
264
-
265
- ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
266
238
%tmp1 = call <4 x i1 > @llvm.get.active.lane.mask.v4i1.i32 (i32 %index , i32 %N )
267
-
268
239
%tmp2 = bitcast i32* %tmp to <4 x i32 >*
269
240
%wide.masked.load.a = call <4 x i32 > @llvm.masked.load.v4i32.p0v4i32 (<4 x i32 >* %tmp2 , i32 4 , <4 x i1 > %tmp1 , <4 x i32 > undef )
270
241
%tmp3 = getelementptr inbounds i32 , i32* %b , i32 %index
@@ -304,13 +275,12 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
304
275
; CHECK-NEXT: cmp.w r12, #0
305
276
; CHECK-NEXT: beq .LBB3_4
306
277
; CHECK-NEXT: @ %bb.1: @ %vector.ph
307
- ; CHECK-NEXT: add.w r4, r12, #3
308
- ; CHECK-NEXT: vmov.i32 q1, #0x0
309
- ; CHECK-NEXT: bic r4, r4, #3
310
- ; CHECK-NEXT: sub.w lr, r4, #4
278
+ ; CHECK-NEXT: add.w lr, r12, #3
311
279
; CHECK-NEXT: movs r4, #1
280
+ ; CHECK-NEXT: bic lr, lr, #3
281
+ ; CHECK-NEXT: vmov.i32 q1, #0x0
282
+ ; CHECK-NEXT: sub.w lr, lr, #4
312
283
; CHECK-NEXT: add.w lr, r4, lr, lsr #2
313
- ; CHECK-NEXT: movs r4, #0
314
284
; CHECK-NEXT: dls lr, lr
315
285
; CHECK-NEXT: .LBB3_2: @ %vector.body
316
286
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
@@ -326,9 +296,8 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
326
296
; CHECK-NEXT: vcmpt.i32 ne, q1, zr
327
297
; CHECK-NEXT: vldrwe.u32 q1, [r3], #16
328
298
; CHECK-NEXT: vldrwe.u32 q2, [r2], #16
329
- ; CHECK-NEXT: adds r4, #4
330
- ; CHECK-NEXT: vmul.i32 q1, q2, q1
331
299
; CHECK-NEXT: sub.w r12, r12, #4
300
+ ; CHECK-NEXT: vmul.i32 q1, q2, q1
332
301
; CHECK-NEXT: vadd.i32 q1, q1, q0
333
302
; CHECK-NEXT: le lr, .LBB3_2
334
303
; CHECK-NEXT: @ %bb.3: @ %middle.block
@@ -348,22 +317,13 @@ entry:
348
317
vector.ph: ; preds = %entry
349
318
%n.rnd.up = add i32 %N , 3
350
319
%n.vec = and i32 %n.rnd.up , -4
351
- %trip.count.minus.1 = add i32 %N , -1
352
- %broadcast.splatinsert11 = insertelement <4 x i32 > undef , i32 %trip.count.minus.1 , i32 0
353
- %broadcast.splat12 = shufflevector <4 x i32 > %broadcast.splatinsert11 , <4 x i32 > undef , <4 x i32 > zeroinitializer
354
320
br label %vector.body
355
321
356
322
vector.body: ; preds = %vector.body, %vector.ph
357
323
%index = phi i32 [ 0 , %vector.ph ], [ %index.next , %vector.body ]
358
324
%vec.phi = phi <4 x i32 > [ zeroinitializer , %vector.ph ], [ %add , %vector.body ]
359
- %broadcast.splatinsert = insertelement <4 x i32 > undef , i32 %index , i32 0
360
- %broadcast.splat = shufflevector <4 x i32 > %broadcast.splatinsert , <4 x i32 > undef , <4 x i32 > zeroinitializer
361
- %induction = add <4 x i32 > %broadcast.splat , <i32 0 , i32 1 , i32 2 , i32 3 >
362
325
%tmp = getelementptr inbounds i32 , i32* %a , i32 %index
363
-
364
- ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat12
365
326
%tmp1 = call <4 x i1 > @llvm.get.active.lane.mask.v4i1.i32 (i32 %index , i32 %N )
366
-
367
327
%tmp2 = bitcast i32* %tmp to <4 x i32 >*
368
328
%wide.masked.load.a = call <4 x i32 > @llvm.masked.load.v4i32.p0v4i32 (<4 x i32 >* %tmp2 , i32 4 , <4 x i1 > %tmp1 , <4 x i32 > undef )
369
329
%tmp3 = getelementptr inbounds i32 , i32* %b , i32 %index
@@ -402,11 +362,9 @@ define dso_local void @continue_on_zero(i32* noalias nocapture %arg, i32* noalia
402
362
; CHECK-NEXT: it eq
403
363
; CHECK-NEXT: popeq {r7, pc}
404
364
; CHECK-NEXT: .LBB4_1: @ %bb3
405
- ; CHECK-NEXT: movs r3, #0
406
365
; CHECK-NEXT: dlstp.32 lr, r2
407
366
; CHECK-NEXT: .LBB4_2: @ %bb9
408
367
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
409
- ; CHECK-NEXT: adds r3, #4
410
368
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
411
369
; CHECK-NEXT: vpt.i32 ne, q0, zr
412
370
; CHECK-NEXT: vldrwt.u32 q1, [r0]
@@ -423,21 +381,12 @@ bb:
423
381
bb3: ; preds = %bb
424
382
%tmp4 = add i32 %arg2 , 3
425
383
%tmp5 = and i32 %tmp4 , -4
426
- %tmp6 = add i32 %arg2 , -1
427
- %tmp7 = insertelement <4 x i32 > undef , i32 %tmp6 , i32 0
428
- %tmp8 = shufflevector <4 x i32 > %tmp7 , <4 x i32 > undef , <4 x i32 > zeroinitializer
429
384
br label %bb9
430
385
431
386
bb9: ; preds = %bb9, %bb3
432
387
%tmp10 = phi i32 [ 0 , %bb3 ], [ %tmp25 , %bb9 ]
433
- %tmp11 = insertelement <4 x i32 > undef , i32 %tmp10 , i32 0
434
- %tmp12 = shufflevector <4 x i32 > %tmp11 , <4 x i32 > undef , <4 x i32 > zeroinitializer
435
- %tmp13 = add <4 x i32 > %tmp12 , <i32 0 , i32 1 , i32 2 , i32 3 >
436
388
%tmp14 = getelementptr inbounds i32 , i32* %arg1 , i32 %tmp10
437
-
438
- ; %tmp15 = icmp ule <4 x i32> %tmp13, %tmp8
439
389
%tmp15 = call <4 x i1 > @llvm.get.active.lane.mask.v4i1.i32 (i32 %tmp10 , i32 %arg2 )
440
-
441
390
%tmp16 = bitcast i32* %tmp14 to <4 x i32 >*
442
391
%tmp17 = call <4 x i32 > @llvm.masked.load.v4i32.p0v4i32 (<4 x i32 >* %tmp16 , i32 4 , <4 x i1 > %tmp15 , <4 x i32 > undef )
443
392
%tmp18 = icmp ne <4 x i32 > %tmp17 , zeroinitializer
@@ -464,15 +413,13 @@ define dso_local arm_aapcs_vfpcc void @range_test(i32* noalias nocapture %arg, i
464
413
; CHECK-NEXT: it eq
465
414
; CHECK-NEXT: popeq {r7, pc}
466
415
; CHECK-NEXT: .LBB5_1: @ %bb4
467
- ; CHECK-NEXT: mov.w r12, #0
468
416
; CHECK-NEXT: dlstp.32 lr, r3
469
417
; CHECK-NEXT: .LBB5_2: @ %bb12
470
418
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
471
419
; CHECK-NEXT: vldrw.u32 q0, [r0]
472
420
; CHECK-NEXT: vptt.i32 ne, q0, zr
473
421
; CHECK-NEXT: vcmpt.s32 le, q0, r2
474
422
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
475
- ; CHECK-NEXT: add.w r12, r12, #4
476
423
; CHECK-NEXT: vmul.i32 q0, q1, q0
477
424
; CHECK-NEXT: vpst
478
425
; CHECK-NEXT: vstrwt.32 q0, [r0], #16
@@ -486,23 +433,14 @@ bb:
486
433
bb4: ; preds = %bb
487
434
%tmp5 = add i32 %arg3 , 3
488
435
%tmp6 = and i32 %tmp5 , -4
489
- %tmp7 = add i32 %arg3 , -1
490
- %tmp8 = insertelement <4 x i32 > undef , i32 %tmp7 , i32 0
491
- %tmp9 = shufflevector <4 x i32 > %tmp8 , <4 x i32 > undef , <4 x i32 > zeroinitializer
492
436
%tmp10 = insertelement <4 x i32 > undef , i32 %arg2 , i32 0
493
437
%tmp11 = shufflevector <4 x i32 > %tmp10 , <4 x i32 > undef , <4 x i32 > zeroinitializer
494
438
br label %bb12
495
439
496
440
bb12: ; preds = %bb12, %bb4
497
441
%tmp13 = phi i32 [ 0 , %bb4 ], [ %tmp30 , %bb12 ]
498
- %tmp14 = insertelement <4 x i32 > undef , i32 %tmp13 , i32 0
499
- %tmp15 = shufflevector <4 x i32 > %tmp14 , <4 x i32 > undef , <4 x i32 > zeroinitializer
500
- %tmp16 = add <4 x i32 > %tmp15 , <i32 0 , i32 1 , i32 2 , i32 3 >
501
442
%tmp17 = getelementptr inbounds i32 , i32* %arg , i32 %tmp13
502
-
503
- ; %tmp18 = icmp ule <4 x i32> %tmp16, %tmp9
504
443
%tmp18 = call <4 x i1 > @llvm.get.active.lane.mask.v4i1.i32 (i32 %tmp13 , i32 %arg3 )
505
-
506
444
%tmp19 = bitcast i32* %tmp17 to <4 x i32 >*
507
445
%tmp20 = call <4 x i32 > @llvm.masked.load.v4i32.p0v4i32 (<4 x i32 >* %tmp19 , i32 4 , <4 x i1 > %tmp18 , <4 x i32 > undef )
508
446
%tmp21 = icmp ne <4 x i32 > %tmp20 , zeroinitializer
0 commit comments