@@ -301,21 +301,18 @@ define void @transpose_s16_8x8_simpler(ptr nocapture noundef %a) {
301
301
; CHECK-LABEL: transpose_s16_8x8_simpler:
302
302
; CHECK: // %bb.0: // %entry
303
303
; CHECK-NEXT: ldp q0, q1, [x0]
304
- ; CHECK-NEXT: mov x8, x0
305
304
; CHECK-NEXT: ldp q2, q3, [x0, #32]
306
305
; CHECK-NEXT: trn1 v0.8h, v0.8h, v1.8h
307
- ; CHECK-NEXT: ldp q5, q6, [x0, #80]
306
+ ; CHECK-NEXT: ldp q4, q5, [x0, #64]
308
307
; CHECK-NEXT: trn1 v2.8h, v2.8h, v3.8h
309
- ; CHECK-NEXT: ldr q4, [x8, #64]!
310
- ; CHECK-NEXT: ldr q1, [x0, #112]
308
+ ; CHECK-NEXT: ldp q6, q1, [x0, #96]
311
309
; CHECK-NEXT: trn1 v3.8h, v4.8h, v5.8h
312
- ; CHECK-NEXT: trn1 v1.8h, v6.8h, v1.8h
313
310
; CHECK-NEXT: trn1 v3.4s, v0.4s, v3.4s
311
+ ; CHECK-NEXT: trn1 v1.8h, v6.8h, v1.8h
314
312
; CHECK-NEXT: trn1 v4.4s, v2.4s, v1.4s
315
- ; CHECK-NEXT: ext v0.16b, v3.16b, v3.16b, #8
316
- ; CHECK-NEXT: ext v1.16b, v4.16b, v4.16b, #8
313
+ ; CHECK-NEXT: zip2 v0.4s, v3.4s, v4.4s
317
314
; CHECK-NEXT: st2 { v3.2s, v4.2s }, [x0]
318
- ; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x8]
315
+ ; CHECK-NEXT: str q0, [x0, #64]
319
316
; CHECK-NEXT: ret
320
317
entry:
321
318
%0 = load <8 x i16 >, ptr %a , align 16
@@ -355,21 +352,18 @@ define void @transpose_s16_8x8_simpler2(ptr nocapture noundef %a) {
355
352
; CHECK-LABEL: transpose_s16_8x8_simpler2:
356
353
; CHECK: // %bb.0: // %entry
357
354
; CHECK-NEXT: ldp q0, q2, [x0]
358
- ; CHECK-NEXT: mov x8, x0
359
355
; CHECK-NEXT: ldp q3, q4, [x0, #32]
360
356
; CHECK-NEXT: mov v0.h[5], v2.h[4]
361
- ; CHECK-NEXT: ldp q6, q7, [x0, #80]
357
+ ; CHECK-NEXT: ldp q5, q6, [x0, #64]
362
358
; CHECK-NEXT: zip1 v3.8h, v3.8h, v4.8h
363
- ; CHECK-NEXT: ldr q5, [x8, #64]!
364
- ; CHECK-NEXT: ldr q2, [x0, #112]
359
+ ; CHECK-NEXT: ldp q7, q2, [x0, #96]
365
360
; CHECK-NEXT: zip1 v4.8h, v5.8h, v6.8h
366
- ; CHECK-NEXT: mov v7.h[5], v2.h[4]
367
361
; CHECK-NEXT: mov v0.s[1], v4.s[0]
362
+ ; CHECK-NEXT: mov v7.h[5], v2.h[4]
368
363
; CHECK-NEXT: uzp1 v1.4s, v3.4s, v7.4s
369
- ; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
370
- ; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
364
+ ; CHECK-NEXT: zip2 v2.4s, v0.4s, v1.4s
371
365
; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x0]
372
- ; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x8]
366
+ ; CHECK-NEXT: str q2, [x0, #64]
373
367
; CHECK-NEXT: ret
374
368
entry:
375
369
%0 = load <8 x i16 >, ptr %a , align 16
@@ -421,33 +415,29 @@ define void @transpose_s16_8x8(ptr nocapture noundef %0, ptr nocapture noundef %
421
415
; CHECK-NEXT: trn1 v7.8h, v3.8h, v4.8h
422
416
; CHECK-NEXT: trn2 v3.8h, v3.8h, v4.8h
423
417
; CHECK-NEXT: trn1 v4.8h, v0.8h, v6.8h
424
- ; CHECK-NEXT: trn1 v17.8h, v2.8h, v16.8h
425
418
; CHECK-NEXT: trn2 v0.8h, v0.8h, v6.8h
419
+ ; CHECK-NEXT: trn1 v17.8h, v2.8h, v16.8h
426
420
; CHECK-NEXT: trn2 v2.8h, v2.8h, v16.8h
427
421
; CHECK-NEXT: trn1 v18.4s, v5.4s, v4.4s
428
- ; CHECK-NEXT: trn1 v19.4s, v7.4s, v17.4s
429
422
; CHECK-NEXT: trn1 v20.4s, v1.4s, v0.4s
423
+ ; CHECK-NEXT: trn2 v4.4s, v5.4s, v4.4s
430
424
; CHECK-NEXT: trn2 v0.4s, v1.4s, v0.4s
425
+ ; CHECK-NEXT: trn1 v19.4s, v7.4s, v17.4s
431
426
; CHECK-NEXT: trn1 v21.4s, v3.4s, v2.4s
432
- ; CHECK-NEXT: trn2 v4.4s, v5.4s, v4.4s
433
- ; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x0]
434
- ; CHECK-NEXT: trn2 v1.4s, v3.4s, v2.4s
435
- ; CHECK-NEXT: ext v2.16b, v18.16b, v18.16b, #8
436
427
; CHECK-NEXT: trn2 v5.4s, v7.4s, v17.4s
428
+ ; CHECK-NEXT: trn2 v1.4s, v3.4s, v2.4s
429
+ ; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x0]
430
+ ; CHECK-NEXT: zip2 v2.4s, v18.4s, v19.4s
437
431
; CHECK-NEXT: st2 { v20.2s, v21.2s }, [x1]
438
- ; CHECK-NEXT: ext v3.16b, v19.16b, v19.16b, #8
439
- ; CHECK-NEXT: ext v6.16b, v20.16b, v20.16b, #8
440
- ; CHECK-NEXT: ext v7.16b, v21.16b, v21.16b, #8
432
+ ; CHECK-NEXT: zip2 v3.4s, v20.4s, v21.4s
441
433
; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x2]
434
+ ; CHECK-NEXT: zip2 v4.4s, v4.4s, v5.4s
442
435
; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x3]
443
- ; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x4]
444
- ; CHECK-NEXT: ext v2.16b, v4.16b, v4.16b, #8
445
- ; CHECK-NEXT: ext v3.16b, v5.16b, v5.16b, #8
446
- ; CHECK-NEXT: st2 { v6.2s, v7.2s }, [x5]
447
- ; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
448
- ; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8
449
- ; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x6]
450
- ; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x7]
436
+ ; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
437
+ ; CHECK-NEXT: str q2, [x4]
438
+ ; CHECK-NEXT: str q3, [x5]
439
+ ; CHECK-NEXT: str q4, [x6]
440
+ ; CHECK-NEXT: str q0, [x7]
451
441
; CHECK-NEXT: ret
452
442
%9 = load <8 x i16 >, ptr %0 , align 16
453
443
%10 = load <8 x i16 >, ptr %1 , align 16
@@ -505,51 +495,39 @@ define void @transpose_s16_8x8_(ptr nocapture noundef %0) {
505
495
; CHECK: // %bb.0:
506
496
; CHECK-NEXT: mov x8, x0
507
497
; CHECK-NEXT: mov x9, x0
498
+ ; CHECK-NEXT: ldp q1, q2, [x0, #64]
508
499
; CHECK-NEXT: mov x10, x0
509
- ; CHECK-NEXT: mov x11, x0
510
- ; CHECK-NEXT: mov x12, x0
511
- ; CHECK-NEXT: mov x13, x0
512
- ; CHECK-NEXT: mov x14, x0
500
+ ; CHECK-NEXT: ldp q6, q7, [x0, #96]
501
+ ; CHECK-NEXT: trn1 v16.8h, v1.8h, v2.8h
502
+ ; CHECK-NEXT: trn2 v1.8h, v1.8h, v2.8h
513
503
; CHECK-NEXT: ldr q0, [x0]
514
- ; CHECK-NEXT: ldr q1, [x8, #16]!
515
- ; CHECK-NEXT: ldr q2, [x9, #32]!
516
- ; CHECK-NEXT: ldr q3, [x10, #48]!
517
- ; CHECK-NEXT: ldr q4, [x11, #64]!
518
- ; CHECK-NEXT: ldr q6, [x12, #80]!
519
- ; CHECK-NEXT: ldr q7, [x13, #96]!
520
- ; CHECK-NEXT: ldr q16, [x14, #112]!
521
- ; CHECK-NEXT: trn1 v5.8h, v0.8h, v1.8h
522
- ; CHECK-NEXT: trn2 v0.8h, v0.8h, v1.8h
523
- ; CHECK-NEXT: trn1 v1.8h, v2.8h, v3.8h
524
- ; CHECK-NEXT: trn2 v2.8h, v2.8h, v3.8h
525
- ; CHECK-NEXT: trn1 v3.8h, v4.8h, v6.8h
526
- ; CHECK-NEXT: trn2 v4.8h, v4.8h, v6.8h
527
- ; CHECK-NEXT: trn1 v17.8h, v7.8h, v16.8h
528
- ; CHECK-NEXT: trn2 v6.8h, v7.8h, v16.8h
529
- ; CHECK-NEXT: trn1 v18.4s, v5.4s, v3.4s
530
- ; CHECK-NEXT: trn1 v20.4s, v0.4s, v4.4s
531
- ; CHECK-NEXT: trn1 v19.4s, v1.4s, v17.4s
532
- ; CHECK-NEXT: trn1 v21.4s, v2.4s, v6.4s
533
- ; CHECK-NEXT: trn2 v22.4s, v5.4s, v3.4s
534
- ; CHECK-NEXT: trn2 v23.4s, v1.4s, v17.4s
535
- ; CHECK-NEXT: trn2 v0.4s, v0.4s, v4.4s
536
- ; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x0]
537
- ; CHECK-NEXT: trn2 v1.4s, v2.4s, v6.4s
538
- ; CHECK-NEXT: ext v2.16b, v18.16b, v18.16b, #8
539
- ; CHECK-NEXT: st2 { v20.2s, v21.2s }, [x8]
540
- ; CHECK-NEXT: ext v4.16b, v20.16b, v20.16b, #8
541
- ; CHECK-NEXT: ext v3.16b, v19.16b, v19.16b, #8
542
- ; CHECK-NEXT: st2 { v22.2s, v23.2s }, [x9]
543
- ; CHECK-NEXT: ext v5.16b, v21.16b, v21.16b, #8
504
+ ; CHECK-NEXT: ldr q3, [x8, #16]!
505
+ ; CHECK-NEXT: ldr q4, [x9, #32]!
506
+ ; CHECK-NEXT: ldr q5, [x10, #48]!
507
+ ; CHECK-NEXT: trn1 v2.8h, v6.8h, v7.8h
508
+ ; CHECK-NEXT: trn2 v6.8h, v6.8h, v7.8h
509
+ ; CHECK-NEXT: trn1 v7.8h, v0.8h, v3.8h
510
+ ; CHECK-NEXT: trn2 v0.8h, v0.8h, v3.8h
511
+ ; CHECK-NEXT: trn1 v17.8h, v4.8h, v5.8h
512
+ ; CHECK-NEXT: trn2 v3.8h, v4.8h, v5.8h
513
+ ; CHECK-NEXT: trn1 v4.4s, v7.4s, v16.4s
514
+ ; CHECK-NEXT: trn1 v18.4s, v0.4s, v1.4s
515
+ ; CHECK-NEXT: trn2 v20.4s, v7.4s, v16.4s
516
+ ; CHECK-NEXT: trn2 v0.4s, v0.4s, v1.4s
517
+ ; CHECK-NEXT: trn1 v5.4s, v17.4s, v2.4s
518
+ ; CHECK-NEXT: trn1 v19.4s, v3.4s, v6.4s
519
+ ; CHECK-NEXT: trn2 v21.4s, v17.4s, v2.4s
520
+ ; CHECK-NEXT: trn2 v1.4s, v3.4s, v6.4s
521
+ ; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x0]
522
+ ; CHECK-NEXT: zip2 v2.4s, v4.4s, v5.4s
523
+ ; CHECK-NEXT: zip2 v3.4s, v18.4s, v19.4s
524
+ ; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x8]
525
+ ; CHECK-NEXT: zip2 v4.4s, v20.4s, v21.4s
544
526
; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x10]
545
- ; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x11]
546
- ; CHECK-NEXT: ext v2.16b, v22.16b, v22.16b, #8
547
- ; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x12]
548
- ; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
549
- ; CHECK-NEXT: ext v3.16b, v23.16b, v23.16b, #8
550
- ; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8
551
- ; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x13]
552
- ; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x14]
527
+ ; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
528
+ ; CHECK-NEXT: st2 { v20.2s, v21.2s }, [x9]
529
+ ; CHECK-NEXT: stp q2, q3, [x0, #64]
530
+ ; CHECK-NEXT: stp q4, q0, [x0, #96]
553
531
; CHECK-NEXT: ret
554
532
%2 = load <8 x i16 >, ptr %0 , align 16
555
533
%3 = getelementptr inbounds <8 x i16 >, ptr %0 , i64 1
@@ -629,11 +607,10 @@ define void @store_factor2_high(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a
629
607
; CHECK-NEXT: trn1 v2.4s, v0.4s, v1.4s
630
608
; CHECK-NEXT: trn1 v0.4s, v1.4s, v0.4s
631
609
; CHECK-NEXT: zip1 v1.4s, v2.4s, v0.4s
632
- ; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #8
633
- ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
634
610
; CHECK-NEXT: trn1 v1.4s, v1.4s, v0.4s
611
+ ; CHECK-NEXT: zip2 v0.4s, v2.4s, v0.4s
635
612
; CHECK-NEXT: str q1, [x0]
636
- ; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x1]
613
+ ; CHECK-NEXT: str q0, [x1]
637
614
; CHECK-NEXT: ret
638
615
%v0 = shufflevector <4 x i32 > %a0 , <4 x i32 > %a1 , <4 x i32 > <i32 0 , i32 4 , i32 2 , i32 6 >
639
616
%v1 = shufflevector <4 x i32 > %a1 , <4 x i32 > %a0 , <4 x i32 > <i32 0 , i32 4 , i32 2 , i32 6 >
@@ -648,11 +625,10 @@ define void @store_factor2_high2(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %
648
625
; CHECK-LABEL: store_factor2_high2:
649
626
; CHECK: // %bb.0:
650
627
; CHECK-NEXT: zip1 v2.4s, v0.4s, v1.4s
651
- ; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
652
- ; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8
653
- ; CHECK-NEXT: trn1 v0.4s, v2.4s, v1.4s
654
- ; CHECK-NEXT: str q0, [x0]
655
- ; CHECK-NEXT: st2 { v3.2s, v4.2s }, [x1]
628
+ ; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
629
+ ; CHECK-NEXT: trn1 v2.4s, v2.4s, v1.4s
630
+ ; CHECK-NEXT: str q2, [x0]
631
+ ; CHECK-NEXT: str q0, [x1]
656
632
; CHECK-NEXT: ret
657
633
%interleaved.vec = shufflevector <4 x i32 > %a0 , <4 x i32 > %a1 , <4 x i32 > <i32 0 , i32 4 , i32 1 , i32 6 >
658
634
%interleaved.vec2 = shufflevector <4 x i32 > %a0 , <4 x i32 > %a1 , <4 x i32 > <i32 2 , i32 6 , i32 3 , i32 7 >
0 commit comments