Skip to content

Commit 120e968

Browse files
committed
[AArch64] Don't create ST2 for 64bit store that requires an EXT
A 64bit st2 which does not start at element 0 will involve adding extra ext elements, making the st2 unprofitable. This prevents that case, which can lead to a few fewer instructions. Differential Revision: https://reviews.llvm.org/D142966
1 parent 58927e9 commit 120e968

File tree

2 files changed

+63
-83
lines changed

2 files changed

+63
-83
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14626,6 +14626,10 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
1462614626
if (llvm::all_of(Mask, [](int Idx) { return Idx == UndefMaskElem; })) {
1462714627
return false;
1462814628
}
14629+
// A 64bit st2 which does not start at element 0 will involve adding extra
14630+
// ext elements, making the st2 unprofitable.
14631+
if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 && Mask[0] != 0)
14632+
return false;
1462914633

1463014634
Type *PtrTy =
1463114635
UseScalable

llvm/test/CodeGen/AArch64/vldn_shuffle.ll

Lines changed: 59 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -301,21 +301,18 @@ define void @transpose_s16_8x8_simpler(ptr nocapture noundef %a) {
301301
; CHECK-LABEL: transpose_s16_8x8_simpler:
302302
; CHECK: // %bb.0: // %entry
303303
; CHECK-NEXT: ldp q0, q1, [x0]
304-
; CHECK-NEXT: mov x8, x0
305304
; CHECK-NEXT: ldp q2, q3, [x0, #32]
306305
; CHECK-NEXT: trn1 v0.8h, v0.8h, v1.8h
307-
; CHECK-NEXT: ldp q5, q6, [x0, #80]
306+
; CHECK-NEXT: ldp q4, q5, [x0, #64]
308307
; CHECK-NEXT: trn1 v2.8h, v2.8h, v3.8h
309-
; CHECK-NEXT: ldr q4, [x8, #64]!
310-
; CHECK-NEXT: ldr q1, [x0, #112]
308+
; CHECK-NEXT: ldp q6, q1, [x0, #96]
311309
; CHECK-NEXT: trn1 v3.8h, v4.8h, v5.8h
312-
; CHECK-NEXT: trn1 v1.8h, v6.8h, v1.8h
313310
; CHECK-NEXT: trn1 v3.4s, v0.4s, v3.4s
311+
; CHECK-NEXT: trn1 v1.8h, v6.8h, v1.8h
314312
; CHECK-NEXT: trn1 v4.4s, v2.4s, v1.4s
315-
; CHECK-NEXT: ext v0.16b, v3.16b, v3.16b, #8
316-
; CHECK-NEXT: ext v1.16b, v4.16b, v4.16b, #8
313+
; CHECK-NEXT: zip2 v0.4s, v3.4s, v4.4s
317314
; CHECK-NEXT: st2 { v3.2s, v4.2s }, [x0]
318-
; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x8]
315+
; CHECK-NEXT: str q0, [x0, #64]
319316
; CHECK-NEXT: ret
320317
entry:
321318
%0 = load <8 x i16>, ptr %a, align 16
@@ -355,21 +352,18 @@ define void @transpose_s16_8x8_simpler2(ptr nocapture noundef %a) {
355352
; CHECK-LABEL: transpose_s16_8x8_simpler2:
356353
; CHECK: // %bb.0: // %entry
357354
; CHECK-NEXT: ldp q0, q2, [x0]
358-
; CHECK-NEXT: mov x8, x0
359355
; CHECK-NEXT: ldp q3, q4, [x0, #32]
360356
; CHECK-NEXT: mov v0.h[5], v2.h[4]
361-
; CHECK-NEXT: ldp q6, q7, [x0, #80]
357+
; CHECK-NEXT: ldp q5, q6, [x0, #64]
362358
; CHECK-NEXT: zip1 v3.8h, v3.8h, v4.8h
363-
; CHECK-NEXT: ldr q5, [x8, #64]!
364-
; CHECK-NEXT: ldr q2, [x0, #112]
359+
; CHECK-NEXT: ldp q7, q2, [x0, #96]
365360
; CHECK-NEXT: zip1 v4.8h, v5.8h, v6.8h
366-
; CHECK-NEXT: mov v7.h[5], v2.h[4]
367361
; CHECK-NEXT: mov v0.s[1], v4.s[0]
362+
; CHECK-NEXT: mov v7.h[5], v2.h[4]
368363
; CHECK-NEXT: uzp1 v1.4s, v3.4s, v7.4s
369-
; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
370-
; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
364+
; CHECK-NEXT: zip2 v2.4s, v0.4s, v1.4s
371365
; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x0]
372-
; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x8]
366+
; CHECK-NEXT: str q2, [x0, #64]
373367
; CHECK-NEXT: ret
374368
entry:
375369
%0 = load <8 x i16>, ptr %a, align 16
@@ -421,33 +415,29 @@ define void @transpose_s16_8x8(ptr nocapture noundef %0, ptr nocapture noundef %
421415
; CHECK-NEXT: trn1 v7.8h, v3.8h, v4.8h
422416
; CHECK-NEXT: trn2 v3.8h, v3.8h, v4.8h
423417
; CHECK-NEXT: trn1 v4.8h, v0.8h, v6.8h
424-
; CHECK-NEXT: trn1 v17.8h, v2.8h, v16.8h
425418
; CHECK-NEXT: trn2 v0.8h, v0.8h, v6.8h
419+
; CHECK-NEXT: trn1 v17.8h, v2.8h, v16.8h
426420
; CHECK-NEXT: trn2 v2.8h, v2.8h, v16.8h
427421
; CHECK-NEXT: trn1 v18.4s, v5.4s, v4.4s
428-
; CHECK-NEXT: trn1 v19.4s, v7.4s, v17.4s
429422
; CHECK-NEXT: trn1 v20.4s, v1.4s, v0.4s
423+
; CHECK-NEXT: trn2 v4.4s, v5.4s, v4.4s
430424
; CHECK-NEXT: trn2 v0.4s, v1.4s, v0.4s
425+
; CHECK-NEXT: trn1 v19.4s, v7.4s, v17.4s
431426
; CHECK-NEXT: trn1 v21.4s, v3.4s, v2.4s
432-
; CHECK-NEXT: trn2 v4.4s, v5.4s, v4.4s
433-
; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x0]
434-
; CHECK-NEXT: trn2 v1.4s, v3.4s, v2.4s
435-
; CHECK-NEXT: ext v2.16b, v18.16b, v18.16b, #8
436427
; CHECK-NEXT: trn2 v5.4s, v7.4s, v17.4s
428+
; CHECK-NEXT: trn2 v1.4s, v3.4s, v2.4s
429+
; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x0]
430+
; CHECK-NEXT: zip2 v2.4s, v18.4s, v19.4s
437431
; CHECK-NEXT: st2 { v20.2s, v21.2s }, [x1]
438-
; CHECK-NEXT: ext v3.16b, v19.16b, v19.16b, #8
439-
; CHECK-NEXT: ext v6.16b, v20.16b, v20.16b, #8
440-
; CHECK-NEXT: ext v7.16b, v21.16b, v21.16b, #8
432+
; CHECK-NEXT: zip2 v3.4s, v20.4s, v21.4s
441433
; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x2]
434+
; CHECK-NEXT: zip2 v4.4s, v4.4s, v5.4s
442435
; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x3]
443-
; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x4]
444-
; CHECK-NEXT: ext v2.16b, v4.16b, v4.16b, #8
445-
; CHECK-NEXT: ext v3.16b, v5.16b, v5.16b, #8
446-
; CHECK-NEXT: st2 { v6.2s, v7.2s }, [x5]
447-
; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
448-
; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8
449-
; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x6]
450-
; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x7]
436+
; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
437+
; CHECK-NEXT: str q2, [x4]
438+
; CHECK-NEXT: str q3, [x5]
439+
; CHECK-NEXT: str q4, [x6]
440+
; CHECK-NEXT: str q0, [x7]
451441
; CHECK-NEXT: ret
452442
%9 = load <8 x i16>, ptr %0, align 16
453443
%10 = load <8 x i16>, ptr %1, align 16
@@ -505,51 +495,39 @@ define void @transpose_s16_8x8_(ptr nocapture noundef %0) {
505495
; CHECK: // %bb.0:
506496
; CHECK-NEXT: mov x8, x0
507497
; CHECK-NEXT: mov x9, x0
498+
; CHECK-NEXT: ldp q1, q2, [x0, #64]
508499
; CHECK-NEXT: mov x10, x0
509-
; CHECK-NEXT: mov x11, x0
510-
; CHECK-NEXT: mov x12, x0
511-
; CHECK-NEXT: mov x13, x0
512-
; CHECK-NEXT: mov x14, x0
500+
; CHECK-NEXT: ldp q6, q7, [x0, #96]
501+
; CHECK-NEXT: trn1 v16.8h, v1.8h, v2.8h
502+
; CHECK-NEXT: trn2 v1.8h, v1.8h, v2.8h
513503
; CHECK-NEXT: ldr q0, [x0]
514-
; CHECK-NEXT: ldr q1, [x8, #16]!
515-
; CHECK-NEXT: ldr q2, [x9, #32]!
516-
; CHECK-NEXT: ldr q3, [x10, #48]!
517-
; CHECK-NEXT: ldr q4, [x11, #64]!
518-
; CHECK-NEXT: ldr q6, [x12, #80]!
519-
; CHECK-NEXT: ldr q7, [x13, #96]!
520-
; CHECK-NEXT: ldr q16, [x14, #112]!
521-
; CHECK-NEXT: trn1 v5.8h, v0.8h, v1.8h
522-
; CHECK-NEXT: trn2 v0.8h, v0.8h, v1.8h
523-
; CHECK-NEXT: trn1 v1.8h, v2.8h, v3.8h
524-
; CHECK-NEXT: trn2 v2.8h, v2.8h, v3.8h
525-
; CHECK-NEXT: trn1 v3.8h, v4.8h, v6.8h
526-
; CHECK-NEXT: trn2 v4.8h, v4.8h, v6.8h
527-
; CHECK-NEXT: trn1 v17.8h, v7.8h, v16.8h
528-
; CHECK-NEXT: trn2 v6.8h, v7.8h, v16.8h
529-
; CHECK-NEXT: trn1 v18.4s, v5.4s, v3.4s
530-
; CHECK-NEXT: trn1 v20.4s, v0.4s, v4.4s
531-
; CHECK-NEXT: trn1 v19.4s, v1.4s, v17.4s
532-
; CHECK-NEXT: trn1 v21.4s, v2.4s, v6.4s
533-
; CHECK-NEXT: trn2 v22.4s, v5.4s, v3.4s
534-
; CHECK-NEXT: trn2 v23.4s, v1.4s, v17.4s
535-
; CHECK-NEXT: trn2 v0.4s, v0.4s, v4.4s
536-
; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x0]
537-
; CHECK-NEXT: trn2 v1.4s, v2.4s, v6.4s
538-
; CHECK-NEXT: ext v2.16b, v18.16b, v18.16b, #8
539-
; CHECK-NEXT: st2 { v20.2s, v21.2s }, [x8]
540-
; CHECK-NEXT: ext v4.16b, v20.16b, v20.16b, #8
541-
; CHECK-NEXT: ext v3.16b, v19.16b, v19.16b, #8
542-
; CHECK-NEXT: st2 { v22.2s, v23.2s }, [x9]
543-
; CHECK-NEXT: ext v5.16b, v21.16b, v21.16b, #8
504+
; CHECK-NEXT: ldr q3, [x8, #16]!
505+
; CHECK-NEXT: ldr q4, [x9, #32]!
506+
; CHECK-NEXT: ldr q5, [x10, #48]!
507+
; CHECK-NEXT: trn1 v2.8h, v6.8h, v7.8h
508+
; CHECK-NEXT: trn2 v6.8h, v6.8h, v7.8h
509+
; CHECK-NEXT: trn1 v7.8h, v0.8h, v3.8h
510+
; CHECK-NEXT: trn2 v0.8h, v0.8h, v3.8h
511+
; CHECK-NEXT: trn1 v17.8h, v4.8h, v5.8h
512+
; CHECK-NEXT: trn2 v3.8h, v4.8h, v5.8h
513+
; CHECK-NEXT: trn1 v4.4s, v7.4s, v16.4s
514+
; CHECK-NEXT: trn1 v18.4s, v0.4s, v1.4s
515+
; CHECK-NEXT: trn2 v20.4s, v7.4s, v16.4s
516+
; CHECK-NEXT: trn2 v0.4s, v0.4s, v1.4s
517+
; CHECK-NEXT: trn1 v5.4s, v17.4s, v2.4s
518+
; CHECK-NEXT: trn1 v19.4s, v3.4s, v6.4s
519+
; CHECK-NEXT: trn2 v21.4s, v17.4s, v2.4s
520+
; CHECK-NEXT: trn2 v1.4s, v3.4s, v6.4s
521+
; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x0]
522+
; CHECK-NEXT: zip2 v2.4s, v4.4s, v5.4s
523+
; CHECK-NEXT: zip2 v3.4s, v18.4s, v19.4s
524+
; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x8]
525+
; CHECK-NEXT: zip2 v4.4s, v20.4s, v21.4s
544526
; CHECK-NEXT: st2 { v0.2s, v1.2s }, [x10]
545-
; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x11]
546-
; CHECK-NEXT: ext v2.16b, v22.16b, v22.16b, #8
547-
; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x12]
548-
; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
549-
; CHECK-NEXT: ext v3.16b, v23.16b, v23.16b, #8
550-
; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8
551-
; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x13]
552-
; CHECK-NEXT: st2 { v4.2s, v5.2s }, [x14]
527+
; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
528+
; CHECK-NEXT: st2 { v20.2s, v21.2s }, [x9]
529+
; CHECK-NEXT: stp q2, q3, [x0, #64]
530+
; CHECK-NEXT: stp q4, q0, [x0, #96]
553531
; CHECK-NEXT: ret
554532
%2 = load <8 x i16>, ptr %0, align 16
555533
%3 = getelementptr inbounds <8 x i16>, ptr %0, i64 1
@@ -629,11 +607,10 @@ define void @store_factor2_high(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %a
629607
; CHECK-NEXT: trn1 v2.4s, v0.4s, v1.4s
630608
; CHECK-NEXT: trn1 v0.4s, v1.4s, v0.4s
631609
; CHECK-NEXT: zip1 v1.4s, v2.4s, v0.4s
632-
; CHECK-NEXT: ext v2.16b, v2.16b, v2.16b, #8
633-
; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
634610
; CHECK-NEXT: trn1 v1.4s, v1.4s, v0.4s
611+
; CHECK-NEXT: zip2 v0.4s, v2.4s, v0.4s
635612
; CHECK-NEXT: str q1, [x0]
636-
; CHECK-NEXT: st2 { v2.2s, v3.2s }, [x1]
613+
; CHECK-NEXT: str q0, [x1]
637614
; CHECK-NEXT: ret
638615
%v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
639616
%v1 = shufflevector <4 x i32> %a1, <4 x i32> %a0, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
@@ -648,11 +625,10 @@ define void @store_factor2_high2(ptr %ptr, ptr %ptr2, <4 x i32> %a0, <4 x i32> %
648625
; CHECK-LABEL: store_factor2_high2:
649626
; CHECK: // %bb.0:
650627
; CHECK-NEXT: zip1 v2.4s, v0.4s, v1.4s
651-
; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #8
652-
; CHECK-NEXT: ext v4.16b, v1.16b, v1.16b, #8
653-
; CHECK-NEXT: trn1 v0.4s, v2.4s, v1.4s
654-
; CHECK-NEXT: str q0, [x0]
655-
; CHECK-NEXT: st2 { v3.2s, v4.2s }, [x1]
628+
; CHECK-NEXT: zip2 v0.4s, v0.4s, v1.4s
629+
; CHECK-NEXT: trn1 v2.4s, v2.4s, v1.4s
630+
; CHECK-NEXT: str q2, [x0]
631+
; CHECK-NEXT: str q0, [x1]
656632
; CHECK-NEXT: ret
657633
%interleaved.vec = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 0, i32 4, i32 1, i32 6>
658634
%interleaved.vec2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <4 x i32> <i32 2, i32 6, i32 3, i32 7>

0 commit comments

Comments
 (0)