@@ -225,6 +225,163 @@ for.cond.cleanup: ; preds = %for.body, %middle.b
   ret void
 }
 
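+; Rough scalar equivalent of the IR below (reconstructed from the IR for
+; readability, not taken from an original C source):
+;   for (i = 0; i < n; i++) y[i << 2] = x[i];
+; The shifted induction vector becomes the offset register of a single
+; vstrw.32 scatter inside a tail-predicated dlstp/letp loop.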
+define void @shl(i32* nocapture readonly %x, i32* noalias nocapture %y, i32 %n) {
+; CHECK-LABEL: shl:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    it lt
+; CHECK-NEXT:    poplt {r7, pc}
+; CHECK-NEXT:  .LBB4_1: @ %vector.ph
+; CHECK-NEXT:    adr r3, .LCPI4_0
+; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vmov.i32 q1, #0x4
+; CHECK-NEXT:    dlstp.32 lr, r2
+; CHECK-NEXT:  .LBB4_2: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vshl.i32 q3, q0, #2
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vldrw.u32 q2, [r0], #16
+; CHECK-NEXT:    vstrw.32 q2, [r1, q3, uxtw #2]
+; CHECK-NEXT:    letp lr, .LBB4_2
+; CHECK-NEXT:  @ %bb.3: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.4:
+; CHECK-NEXT:  .LCPI4_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 1 @ 0x1
+; CHECK-NEXT:    .long 2 @ 0x2
+; CHECK-NEXT:    .long 3 @ 0x3
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %n, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
+  %0 = getelementptr inbounds i32, i32* %x, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
+  %2 = shl nsw <4 x i32> %vec.ind, <i32 2, i32 2, i32 2, i32 2>
+  %3 = getelementptr inbounds i32, i32* %y, <4 x i32> %2
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %wide.masked.load, <4 x i32*> %3, i32 4, <4 x i1> %active.lane.mask)
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
+  %4 = icmp eq i32 %index.next, %n.vec
+  br i1 %4, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
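+; Rough scalar equivalent (again reconstructed from the IR, not from an
+; original C source):
+;   for (i = 0; i < n; i++) {
+;     y[(i << 3) | 0] = x[i] + 1;
+;     y[(i << 3) | 2] = x[i] + 2;
+;     y[(i << 3) | 4] = x[i] + 3;
+;     y[(i << 3) | 6] = x[i] + 4;
+;   }
+; The four interleaved scatters keep all eight q-registers live, hence the
+; vstrw/vldrw spills and reloads visible in the checked output.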
+define void @shlor(i32* nocapture readonly %x, i32* noalias nocapture %y, i32 %n) {
+; CHECK-LABEL: shlor:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #80
+; CHECK-NEXT:    sub sp, #80
+; CHECK-NEXT:    cmp r2, #1
+; CHECK-NEXT:    blt .LBB5_3
+; CHECK-NEXT:  @ %bb.1: @ %vector.ph
+; CHECK-NEXT:    vmov.i32 q1, #0x1
+; CHECK-NEXT:    adr r3, .LCPI5_0
+; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i32 q1, #0x3
+; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i32 q1, #0x2
+; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.i32 q1, #0x4
+; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q6, [sp] @ 16-byte Reload
+; CHECK-NEXT:    dlstp.32 lr, r2
+; CHECK-NEXT:  .LBB5_2: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vldrw.u32 q5, [r0], #16
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vshl.i32 q7, q0, #3
+; CHECK-NEXT:    vadd.i32 q1, q5, q6
+; CHECK-NEXT:    vadd.i32 q2, q5, q2
+; CHECK-NEXT:    vadd.i32 q3, q5, q3
+; CHECK-NEXT:    vadd.i32 q5, q5, q4
+; CHECK-NEXT:    vmov q4, q7
+; CHECK-NEXT:    vstrw.32 q1, [sp, #64] @ 16-byte Spill
+; CHECK-NEXT:    vmov q1, q7
+; CHECK-NEXT:    vstrw.32 q5, [r1, q7, uxtw #2]
+; CHECK-NEXT:    vadd.i32 q0, q0, q6
+; CHECK-NEXT:    vorr.i32 q4, #0x4
+; CHECK-NEXT:    vorr.i32 q7, #0x2
+; CHECK-NEXT:    vstrw.32 q3, [r1, q7, uxtw #2]
+; CHECK-NEXT:    vstrw.32 q2, [r1, q4, uxtw #2]
+; CHECK-NEXT:    vorr.i32 q1, #0x6
+; CHECK-NEXT:    vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT:    vstrw.32 q2, [r1, q1, uxtw #2]
+; CHECK-NEXT:    letp lr, .LBB5_2
+; CHECK-NEXT:  .LBB5_3: @ %for.cond.cleanup
+; CHECK-NEXT:    add sp, #80
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    pop {r7, pc}
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  @ %bb.4:
+; CHECK-NEXT:  .LCPI5_0:
+; CHECK-NEXT:    .long 0 @ 0x0
+; CHECK-NEXT:    .long 1 @ 0x1
+; CHECK-NEXT:    .long 2 @ 0x2
+; CHECK-NEXT:    .long 3 @ 0x3
+entry:
+  %cmp33 = icmp sgt i32 %n, 0
+  br i1 %cmp33, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:                                        ; preds = %entry
+  %n.rnd.up = add i32 %n, 3
+  %n.vec = and i32 %n.rnd.up, -4
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.ind = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %n)
+  %0 = getelementptr inbounds i32, i32* %x, i32 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> poison)
+  %2 = add nsw <4 x i32> %wide.masked.load, <i32 1, i32 1, i32 1, i32 1>
+  %3 = shl nsw <4 x i32> %vec.ind, <i32 3, i32 3, i32 3, i32 3>
+  %4 = getelementptr inbounds i32, i32* %y, <4 x i32> %3
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %2, <4 x i32*> %4, i32 4, <4 x i1> %active.lane.mask)
+  %5 = add nsw <4 x i32> %wide.masked.load, <i32 2, i32 2, i32 2, i32 2>
+  %6 = or <4 x i32> %3, <i32 2, i32 2, i32 2, i32 2>
+  %7 = getelementptr inbounds i32, i32* %y, <4 x i32> %6
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %5, <4 x i32*> %7, i32 4, <4 x i1> %active.lane.mask)
+  %8 = add nsw <4 x i32> %wide.masked.load, <i32 3, i32 3, i32 3, i32 3>
+  %9 = or <4 x i32> %3, <i32 4, i32 4, i32 4, i32 4>
+  %10 = getelementptr inbounds i32, i32* %y, <4 x i32> %9
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %8, <4 x i32*> %10, i32 4, <4 x i1> %active.lane.mask)
+  %11 = add nsw <4 x i32> %wide.masked.load, <i32 4, i32 4, i32 4, i32 4>
+  %12 = or <4 x i32> %3, <i32 6, i32 6, i32 6, i32 6>
+  %13 = getelementptr inbounds i32, i32* %y, <4 x i32> %12
+  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %11, <4 x i32*> %13, i32 4, <4 x i1> %active.lane.mask)
+  %index.next = add i32 %index, 4
+  %vec.ind.next = add <4 x i32> %vec.ind, <i32 4, i32 4, i32 4, i32 4>
+  %14 = icmp eq i32 %index.next, %n.vec
+  br i1 %14, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
 declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)
 declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>)
 declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>)
@@ -234,3 +391,5 @@ declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x
 declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>)
 declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)
 declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>)
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)