Skip to content

Commit d96ea46

Browse files
committed
[AArch64LoadStoreOptimizer] Generate more STPs by renaming registers earlier
Our initial motivating case was memcpys with alignments > 16. The loads/stores to which small memcpys expand are kept together in several places, so that we get a sequence like this for a 64-bit copy:

  LD w0
  LD w1
  ST w0
  ST w1

The load/store optimiser can generate an LDP/STP w0, w1 from this because the registers read/written are consecutive. In our case, however, the sequence is optimised during ISel, resulting in:

  LD w0
  ST w0
  LD w0
  ST w0

This instruction reordering allows reuse of registers. Since the registers are no longer consecutive (i.e. they are the same), it inhibits LDP/STP creation. The approach here is to perform renaming:

  LD w0
  ST w0
  LD w1
  ST w1

to enable the folding of the stores into an STP. We do not yet generate the LDP due to a limitation in the renaming implementation, but plan to look at that in a follow-up so that we fully support this case.

While this was initially motivated by certain memcpys, this is a general approach and thus is beneficial for other cases too, as can be seen in some test changes.

Differential Revision: https://reviews.llvm.org/D103597
1 parent 502edeb commit d96ea46

File tree

5 files changed

+257
-48
lines changed

5 files changed

+257
-48
lines changed

llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp

Lines changed: 52 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1515,6 +1515,32 @@ static Optional<MCPhysReg> tryToFindRegisterToRename(
15151515
return None;
15161516
}
15171517

1518+
// Tries to obtain a rename register for the FirstMI/MI pair by calling
1519+
// tryToFindRegisterToRename(). On success the register is stored in Flags
1520+
// with setRenameReg(), the merge-forward flag is set, and true is returned.
// In every other case Flags is left untouched and false is returned.
//
// Nothing is attempted when DebugCounter::shouldExecute(RegRenamingCounter)
// is false. MaybeCanRename carries a previously computed canRenameUpToDef()
// answer; when it is empty the answer is computed here.
//
// NOTE(review): MaybeCanRename is passed by value, so an answer computed in
// this function is not visible to the caller on later calls -- confirm that
// dropping this caching is intended.
1521+
static bool updateFlagsWithRenameReg(
1522+
Optional<bool> MaybeCanRename, LdStPairFlags &Flags, MachineInstr &FirstMI,
1523+
MachineInstr &MI, LiveRegUnits &DefinedInBB, LiveRegUnits &UsedInBetween,
1524+
SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
1525+
const TargetRegisterInfo *TRI) {
1526+
if (DebugCounter::shouldExecute(RegRenamingCounter)) {
1527+
// Only query canRenameUpToDef() when no earlier answer was supplied.
if (!MaybeCanRename)
1528+
MaybeCanRename = {
1529+
canRenameUpToDef(FirstMI, UsedInBetween, RequiredClasses, TRI)};
1530+
1531+
// Look for a concrete rename register only if renaming is possible at all.
if (*MaybeCanRename) {
1532+
Optional<MCPhysReg> MaybeRenameReg = tryToFindRegisterToRename(
1533+
FirstMI, MI, DefinedInBB, UsedInBetween, RequiredClasses, TRI);
1534+
if (MaybeRenameReg) {
1535+
Flags.setRenameReg(*MaybeRenameReg);
1536+
// presumably a renamed pair has to be merged forward into MI -- TODO confirm
Flags.setMergeForward(true);
1537+
return true;
1538+
}
1539+
}
1540+
}
1541+
// Renaming was disabled, not possible, or no suitable register was found.
return false;
1542+
}
1543+
15181544
/// Scan the instructions looking for a load/store that can be combined with the
15191545
/// current instruction into a wider equivalent or a load/store pair.
15201546
MachineBasicBlock::iterator
@@ -1666,6 +1692,27 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
16661692
continue;
16671693
}
16681694
}
1695+
// If the load/store pattern has been optimized and reordered
1696+
// into the following:
1697+
// ldr q0, [x1, #16]
1698+
// str q0, [x0, #16]
1699+
// ldr q0, [x1]
1700+
// str q0, [x0]
1701+
// and the destination register of the load/store instruction is
1702+
// the same register as or a sub/super register of the other
1703+
// load/store, it will not generate an LDP/STP, so we attempt to
1704+
// rename the register so that it can be recognised as a pair.
1705+
// TODO: This is currently supported for STPs, LDPs are not
1706+
// being generated yet.
1707+
if (TRI->isSuperOrSubRegisterEq(Reg, getLdStRegOp(MI).getReg())) {
1708+
bool flagsHaveRenameReg = updateFlagsWithRenameReg(
1709+
MaybeCanRename, Flags, FirstMI, MI, DefinedInBB, UsedInBetween,
1710+
RequiredClasses, TRI);
1711+
if (flagsHaveRenameReg) {
1712+
MBBIWithRenameReg = MBBI;
1713+
continue;
1714+
}
1715+
}
16691716
// If the destination register of one load is the same register or a
16701717
// sub/super register of the other load, bail and keep looking. A
16711718
// load-pair instruction with both destination registers the same is
@@ -1714,22 +1761,11 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
17141761
Flags.clearRenameReg();
17151762
return MBBI;
17161763
}
1717-
1718-
if (DebugCounter::shouldExecute(RegRenamingCounter)) {
1719-
if (!MaybeCanRename)
1720-
MaybeCanRename = {canRenameUpToDef(FirstMI, UsedInBetween,
1721-
RequiredClasses, TRI)};
1722-
1723-
if (*MaybeCanRename) {
1724-
Optional<MCPhysReg> MaybeRenameReg = tryToFindRegisterToRename(
1725-
FirstMI, MI, DefinedInBB, UsedInBetween, RequiredClasses,
1726-
TRI);
1727-
if (MaybeRenameReg) {
1728-
Flags.setRenameReg(*MaybeRenameReg);
1729-
Flags.setMergeForward(true);
1730-
MBBIWithRenameReg = MBBI;
1731-
}
1732-
}
1764+
bool flagsHaveRenameReg = updateFlagsWithRenameReg(
1765+
MaybeCanRename, Flags, FirstMI, MI, DefinedInBB, UsedInBetween,
1766+
RequiredClasses, TRI);
1767+
if (flagsHaveRenameReg) {
1768+
MBBIWithRenameReg = MBBI;
17331769
}
17341770
}
17351771
// Unable to combine these instructions due to interference in between.

llvm/test/CodeGen/AArch64/GlobalISel/byval-call.ll

Lines changed: 16 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -33,38 +33,30 @@ define void @call_byval_a64i32([64 x i32]* %incoming) {
3333
; CHECK-NEXT: .cfi_offset w28, -16
3434
; CHECK-NEXT: .cfi_offset w30, -24
3535
; CHECK-NEXT: .cfi_offset w29, -32
36-
; CHECK-NEXT: ldr q0, [x0]
37-
; CHECK-NEXT: str q0, [sp]
36+
; CHECK-NEXT: ldr q1, [x0]
3837
; CHECK-NEXT: ldr q0, [x0, #16]
39-
; CHECK-NEXT: str q0, [sp, #16]
40-
; CHECK-NEXT: ldr q0, [x0, #32]
41-
; CHECK-NEXT: str q0, [sp, #32]
38+
; CHECK-NEXT: stp q1, q0, [sp]
39+
; CHECK-NEXT: ldr q1, [x0, #32]
4240
; CHECK-NEXT: ldr q0, [x0, #48]
43-
; CHECK-NEXT: str q0, [sp, #48]
44-
; CHECK-NEXT: ldr q0, [x0, #64]
45-
; CHECK-NEXT: str q0, [sp, #64]
41+
; CHECK-NEXT: stp q1, q0, [sp, #32]
42+
; CHECK-NEXT: ldr q1, [x0, #64]
4643
; CHECK-NEXT: ldr q0, [x0, #80]
47-
; CHECK-NEXT: str q0, [sp, #80]
48-
; CHECK-NEXT: ldr q0, [x0, #96]
49-
; CHECK-NEXT: str q0, [sp, #96]
44+
; CHECK-NEXT: stp q1, q0, [sp, #64]
45+
; CHECK-NEXT: ldr q1, [x0, #96]
5046
; CHECK-NEXT: ldr q0, [x0, #112]
51-
; CHECK-NEXT: str q0, [sp, #112]
52-
; CHECK-NEXT: ldr q0, [x0, #128]
53-
; CHECK-NEXT: str q0, [sp, #128]
47+
; CHECK-NEXT: stp q1, q0, [sp, #96]
48+
; CHECK-NEXT: ldr q1, [x0, #128]
5449
; CHECK-NEXT: ldr q0, [x0, #144]
55-
; CHECK-NEXT: str q0, [sp, #144]
56-
; CHECK-NEXT: ldr q0, [x0, #160]
57-
; CHECK-NEXT: str q0, [sp, #160]
50+
; CHECK-NEXT: stp q1, q0, [sp, #128]
51+
; CHECK-NEXT: ldr q1, [x0, #160]
5852
; CHECK-NEXT: ldr q0, [x0, #176]
59-
; CHECK-NEXT: str q0, [sp, #176]
60-
; CHECK-NEXT: ldr q0, [x0, #192]
61-
; CHECK-NEXT: str q0, [sp, #192]
53+
; CHECK-NEXT: stp q1, q0, [sp, #160]
54+
; CHECK-NEXT: ldr q1, [x0, #192]
6255
; CHECK-NEXT: ldr q0, [x0, #208]
63-
; CHECK-NEXT: str q0, [sp, #208]
64-
; CHECK-NEXT: ldr q0, [x0, #224]
65-
; CHECK-NEXT: str q0, [sp, #224]
56+
; CHECK-NEXT: stp q1, q0, [sp, #192]
57+
; CHECK-NEXT: ldr q1, [x0, #224]
6658
; CHECK-NEXT: ldr q0, [x0, #240]
67-
; CHECK-NEXT: str q0, [sp, #240]
59+
; CHECK-NEXT: stp q1, q0, [sp, #224]
6860
; CHECK-NEXT: bl byval_a64i32
6961
; CHECK-NEXT: ldr x28, [sp, #272] // 8-byte Folded Reload
7062
; CHECK-NEXT: ldp x29, x30, [sp, #256] // 16-byte Folded Reload

llvm/test/CodeGen/AArch64/consthoist-gep.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
; CHECK-NOT: adrp x10, global+332
44
; CHECK-NOT: add x10, x10, :lo12:global+332
55
; CHECK: adrp x10, global+528
6+
; CHECK-NEXT: and w12, w8, #0xffffff
7+
; CHECK-NEXT: ldr w8, [x11]
68
; CHECK-NEXT: add x10, x10, :lo12:global+528
79

810
%struct.blam = type { %struct.bar, %struct.bar.0, %struct.wobble, %struct.wombat, i8, i16, %struct.snork.2, %struct.foo, %struct.snork.3, %struct.wobble.4, %struct.quux, [9 x i16], %struct.spam, %struct.zot }

llvm/test/CodeGen/AArch64/ldst-opt.ll

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1117,7 +1117,7 @@ define void @store-pair-post-indexed-double() nounwind {
11171117
define void @post-indexed-sub-word(i32* %a, i32* %b, i64 %count) nounwind {
11181118
; CHECK-LABEL: post-indexed-sub-word
11191119
; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}], #-8
1120-
; CHECK: str w{{[0-9]+}}, [x{{[0-9]+}}], #-8
1120+
; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #-4]
11211121
br label %for.body
11221122
for.body:
11231123
%phi1 = phi i32* [ %gep4, %for.body ], [ %b, %0 ]
@@ -1141,7 +1141,7 @@ end:
11411141
define void @post-indexed-sub-doubleword(i64* %a, i64* %b, i64 %count) nounwind {
11421142
; CHECK-LABEL: post-indexed-sub-doubleword
11431143
; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}], #-16
1144-
; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}], #-16
1144+
; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [x0, #-8]
11451145
br label %for.body
11461146
for.body:
11471147
%phi1 = phi i64* [ %gep4, %for.body ], [ %b, %0 ]
@@ -1165,7 +1165,7 @@ end:
11651165
define void @post-indexed-sub-quadword(<2 x i64>* %a, <2 x i64>* %b, i64 %count) nounwind {
11661166
; CHECK-LABEL: post-indexed-sub-quadword
11671167
; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}], #-32
1168-
; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}], #-32
1168+
; CHECK: stp q{{[0-9]+}}, q{{[0-9]+}}, [x0, #-16]
11691169
br label %for.body
11701170
for.body:
11711171
%phi1 = phi <2 x i64>* [ %gep4, %for.body ], [ %b, %0 ]
@@ -1189,7 +1189,7 @@ end:
11891189
define void @post-indexed-sub-float(float* %a, float* %b, i64 %count) nounwind {
11901190
; CHECK-LABEL: post-indexed-sub-float
11911191
; CHECK: ldr s{{[0-9]+}}, [x{{[0-9]+}}], #-8
1192-
; CHECK: str s{{[0-9]+}}, [x{{[0-9]+}}], #-8
1192+
; CHECK: stp s{{[0-9]+}}, s{{[0-9]+}}, [x0, #-4]
11931193
br label %for.body
11941194
for.body:
11951195
%phi1 = phi float* [ %gep4, %for.body ], [ %b, %0 ]
@@ -1213,7 +1213,7 @@ end:
12131213
define void @post-indexed-sub-double(double* %a, double* %b, i64 %count) nounwind {
12141214
; CHECK-LABEL: post-indexed-sub-double
12151215
; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}], #-16
1216-
; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}], #-16
1216+
; CHECK: stp d{{[0-9]+}}, d{{[0-9]+}}, [x0, #-8]
12171217
br label %for.body
12181218
for.body:
12191219
%phi1 = phi double* [ %gep4, %for.body ], [ %b, %0 ]
@@ -1237,7 +1237,7 @@ end:
12371237
define void @post-indexed-sub-doubleword-offset-min(i64* %a, i64* %b, i64 %count) nounwind {
12381238
; CHECK-LABEL: post-indexed-sub-doubleword-offset-min
12391239
; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}], #-256
1240-
; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}], #-256
1240+
; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [x0], #-256
12411241
br label %for.body
12421242
for.body:
12431243
%phi1 = phi i64* [ %gep4, %for.body ], [ %b, %0 ]
@@ -1262,8 +1262,7 @@ define void @post-indexed-doubleword-offset-out-of-range(i64* %a, i64* %b, i64 %
12621262
; CHECK-LABEL: post-indexed-doubleword-offset-out-of-range
12631263
; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}]
12641264
; CHECK: add x{{[0-9]+}}, x{{[0-9]+}}, #256
1265-
; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}]
1266-
; CHECK: add x{{[0-9]+}}, x{{[0-9]+}}, #256
1265+
; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [x0], #256
12671266

12681267
br label %for.body
12691268
for.body:

0 commit comments

Comments
 (0)