Skip to content

Commit 8bca87a

Browse files
committed
[AArch64] Don't generate st2 for 64bit store that can use stp
D142966 made it so that st2 that do not start at element 0 use zip2 instead of st2. This extends that to any 64bit store that has a nearby store that can better become a STP operation, which is expected to have a higher throughput. It searches up to 20 instructions away for a store to p+16 or p-16.
1 parent ab261eb commit 8bca87a

File tree

3 files changed

+67
-52
lines changed

3 files changed

+67
-52
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 32 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15236,6 +15236,29 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
1523615236
return true;
1523715237
}
1523815238

15239+
template <typename Iter>
15240+
bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
15241+
int MaxLookupDist = 20;
15242+
unsigned IdxWidth = DL.getIndexSizeInBits(0);
15243+
APInt OffsetA(IdxWidth, 0), OffsetB(IdxWidth, 0);
15244+
const Value *PtrA1 =
15245+
Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
15246+
15247+
while (++It != End && !It->isDebugOrPseudoInst() && MaxLookupDist-- > 0) {
15248+
if (const auto *SI = dyn_cast<StoreInst>(&*It)) {
15249+
const Value *PtrB1 =
15250+
SI->getPointerOperand()->stripAndAccumulateInBoundsConstantOffsets(
15251+
DL, OffsetB);
15252+
if (PtrA1 == PtrB1 &&
15253+
(OffsetA.sextOrTrunc(IdxWidth) - OffsetB.sextOrTrunc(IdxWidth))
15254+
.abs() == 16)
15255+
return true;
15256+
}
15257+
}
15258+
15259+
return false;
15260+
}
15261+
1523915262
/// Lower an interleaved store into a stN intrinsic.
1524015263
///
1524115264
/// E.g. Lower an interleaved store (Factor = 3):
@@ -15327,8 +15350,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
1532715350
return false;
1532815351
}
1532915352
// A 64bit st2 which does not start at element 0 will involve adding extra
15330-
// ext elements, making the st2 unprofitable.
15331-
if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 && Mask[0] != 0)
15353+
// ext elements making the st2 unprofitable, and if there is a nearby store
15354+
// that points to BaseAddr+16 or BaseAddr-16 then it can be better left as a
15355+
// zip;stp pair which has higher throughput.
15356+
if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 &&
15357+
(Mask[0] != 0 ||
15358+
hasNearbyPairedStore(SI->getIterator(), SI->getParent()->end(), BaseAddr,
15359+
DL) ||
15360+
hasNearbyPairedStore(SI->getReverseIterator(), SI->getParent()->rend(),
15361+
BaseAddr, DL)))
1533215362
return false;
1533315363

1533415364
Type *PtrTy = SI->getPointerOperandType();

llvm/test/CodeGen/AArch64/machine-cse-profitable-check.ll

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,26 +1,14 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
2-
; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK-BASE
3-
; RUN: llc -mtriple aarch64-none-linux-gnu -aggressive-machine-cse < %s | FileCheck %s --check-prefixes=CHECK-AGGRESSIVE-CSE
2+
; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s
43

54
define void @foo(ptr %buf, <8 x i16> %a) {
6-
; CHECK-BASE-LABEL: foo:
7-
; CHECK-BASE: // %bb.0: // %entry
8-
; CHECK-BASE-NEXT: movi v2.2d, #0000000000000000
9-
; CHECK-BASE-NEXT: // kill: def $q0 killed $q0 def $q0_q1
10-
; CHECK-BASE-NEXT: zip2 v2.8h, v0.8h, v2.8h
11-
; CHECK-BASE-NEXT: movi v1.2d, #0000000000000000
12-
; CHECK-BASE-NEXT: st2 { v0.4h, v1.4h }, [x0], #16
13-
; CHECK-BASE-NEXT: str q2, [x0]
14-
; CHECK-BASE-NEXT: ret
15-
;
16-
; CHECK-AGGRESSIVE-CSE-LABEL: foo:
17-
; CHECK-AGGRESSIVE-CSE: // %bb.0: // %entry
18-
; CHECK-AGGRESSIVE-CSE-NEXT: // kill: def $q0 killed $q0 def $q0_q1
19-
; CHECK-AGGRESSIVE-CSE-NEXT: movi v1.2d, #0000000000000000
20-
; CHECK-AGGRESSIVE-CSE-NEXT: zip2 v2.8h, v0.8h, v1.8h
21-
; CHECK-AGGRESSIVE-CSE-NEXT: st2 { v0.4h, v1.4h }, [x0], #16
22-
; CHECK-AGGRESSIVE-CSE-NEXT: str q2, [x0]
23-
; CHECK-AGGRESSIVE-CSE-NEXT: ret
5+
; CHECK-LABEL: foo:
6+
; CHECK: // %bb.0: // %entry
7+
; CHECK-NEXT: movi v1.2d, #0000000000000000
8+
; CHECK-NEXT: zip1 v2.8h, v0.8h, v1.8h
9+
; CHECK-NEXT: zip2 v0.8h, v0.8h, v1.8h
10+
; CHECK-NEXT: stp q2, q0, [x0]
11+
; CHECK-NEXT: ret
2412
entry:
2513
%vzip.i = shufflevector <8 x i16> %a, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 poison, i16 poison, i16 poison, i16 poison>, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
2614
%vzip1.i = shufflevector <8 x i16> %a, <8 x i16> <i16 poison, i16 poison, i16 poison, i16 poison, i16 0, i16 0, i16 0, i16 0>, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>

llvm/test/CodeGen/AArch64/vldn_shuffle.ll

Lines changed: 27 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -493,41 +493,38 @@ define void @transpose_s16_8x8(ptr nocapture noundef %0, ptr nocapture noundef %
493493
define void @transpose_s16_8x8_(ptr nocapture noundef %0) {
494494
; CHECK-LABEL: transpose_s16_8x8_:
495495
; CHECK: // %bb.0:
496-
; CHECK-NEXT: mov x8, x0
496+
; CHECK-NEXT: ldp q0, q1, [x0]
497+
; CHECK-NEXT: ldp q2, q3, [x0, #32]
497498
; CHECK-NEXT: ldp q4, q5, [x0, #64]
498-
; CHECK-NEXT: mov x9, x0
499-
; CHECK-NEXT: ldr q0, [x8, #16]!
500-
; CHECK-NEXT: mov x10, x0
501-
; CHECK-NEXT: ldr q3, [x0]
502499
; CHECK-NEXT: ldp q6, q7, [x0, #96]
500+
; CHECK-NEXT: trn1 v16.8h, v0.8h, v1.8h
501+
; CHECK-NEXT: trn2 v0.8h, v0.8h, v1.8h
502+
; CHECK-NEXT: trn1 v1.8h, v2.8h, v3.8h
503+
; CHECK-NEXT: trn2 v2.8h, v2.8h, v3.8h
503504
; CHECK-NEXT: trn1 v17.8h, v4.8h, v5.8h
504-
; CHECK-NEXT: ldr q1, [x9, #32]!
505-
; CHECK-NEXT: trn1 v16.8h, v3.8h, v0.8h
506-
; CHECK-NEXT: ldr q2, [x10, #48]!
507-
; CHECK-NEXT: trn2 v4.8h, v4.8h, v5.8h
508-
; CHECK-NEXT: trn1 v19.8h, v6.8h, v7.8h
509-
; CHECK-NEXT: trn2 v0.8h, v3.8h, v0.8h
510-
; CHECK-NEXT: trn2 v3.8h, v6.8h, v7.8h
511-
; CHECK-NEXT: trn1 v18.8h, v1.8h, v2.8h
512-
; CHECK-NEXT: trn2 v1.8h, v1.8h, v2.8h
505+
; CHECK-NEXT: trn2 v3.8h, v4.8h, v5.8h
506+
; CHECK-NEXT: trn1 v18.8h, v6.8h, v7.8h
507+
; CHECK-NEXT: trn2 v4.8h, v6.8h, v7.8h
513508
; CHECK-NEXT: trn1 v5.4s, v16.4s, v17.4s
509+
; CHECK-NEXT: trn1 v7.4s, v0.4s, v3.4s
514510
; CHECK-NEXT: trn2 v16.4s, v16.4s, v17.4s
515-
; CHECK-NEXT: trn1 v20.4s, v0.4s, v4.4s
516-
; CHECK-NEXT: trn1 v6.4s, v18.4s, v19.4s
517-
; CHECK-NEXT: trn2 v17.4s, v18.4s, v19.4s
518-
; CHECK-NEXT: trn2 v18.4s, v0.4s, v4.4s
519-
; CHECK-NEXT: trn1 v21.4s, v1.4s, v3.4s
520-
; CHECK-NEXT: trn2 v19.4s, v1.4s, v3.4s
521-
; CHECK-NEXT: zip2 v0.4s, v5.4s, v6.4s
522-
; CHECK-NEXT: zip2 v2.4s, v16.4s, v17.4s
523-
; CHECK-NEXT: st2 { v5.2s, v6.2s }, [x0]
524-
; CHECK-NEXT: zip2 v1.4s, v20.4s, v21.4s
525-
; CHECK-NEXT: zip2 v3.4s, v18.4s, v19.4s
526-
; CHECK-NEXT: st2 { v20.2s, v21.2s }, [x8]
527-
; CHECK-NEXT: st2 { v16.2s, v17.2s }, [x9]
528-
; CHECK-NEXT: st2 { v18.2s, v19.2s }, [x10]
529-
; CHECK-NEXT: stp q0, q1, [x0, #64]
530-
; CHECK-NEXT: stp q2, q3, [x0, #96]
511+
; CHECK-NEXT: trn1 v6.4s, v1.4s, v18.4s
512+
; CHECK-NEXT: trn1 v19.4s, v2.4s, v4.4s
513+
; CHECK-NEXT: trn2 v1.4s, v1.4s, v18.4s
514+
; CHECK-NEXT: trn2 v0.4s, v0.4s, v3.4s
515+
; CHECK-NEXT: trn2 v2.4s, v2.4s, v4.4s
516+
; CHECK-NEXT: zip1 v3.4s, v5.4s, v6.4s
517+
; CHECK-NEXT: zip1 v4.4s, v7.4s, v19.4s
518+
; CHECK-NEXT: zip1 v17.4s, v16.4s, v1.4s
519+
; CHECK-NEXT: zip1 v18.4s, v0.4s, v2.4s
520+
; CHECK-NEXT: zip2 v5.4s, v5.4s, v6.4s
521+
; CHECK-NEXT: zip2 v1.4s, v16.4s, v1.4s
522+
; CHECK-NEXT: zip2 v0.4s, v0.4s, v2.4s
523+
; CHECK-NEXT: stp q3, q4, [x0]
524+
; CHECK-NEXT: zip2 v3.4s, v7.4s, v19.4s
525+
; CHECK-NEXT: stp q17, q18, [x0, #32]
526+
; CHECK-NEXT: stp q1, q0, [x0, #96]
527+
; CHECK-NEXT: stp q5, q3, [x0, #64]
531528
; CHECK-NEXT: ret
532529
%2 = load <8 x i16>, ptr %0, align 16
533530
%3 = getelementptr inbounds <8 x i16>, ptr %0, i64 1

0 commit comments

Comments
 (0)