
Commit c855e92

[AArch64] Place the first ldp at the end when ReverseCSRRestoreSeq is true
Put the first ldp at the end, so that the load-store optimizer can run and merge the ldp and the add into a post-index ldp. This didn't work when no frame was needed, which resulted in code size regressions.

llvm-svn: 331044
1 parent: ffb8d87
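
To illustrate the transformation (a sketch based on the epilogue example in the comments added by this patch; the exact registers and the #64 offset are illustrative and depend on the function's callee-saved registers):

    // Restoring in the same order as we saved leaves the first reload at [sp],
    // followed by the stack-pointer adjustment:
    ldp x26, x25, [sp]
    ldp x24, x23, [sp, #16]
    ldp x22, x21, [sp, #32]
    ldp x20, x19, [sp, #48]
    add sp, sp, #64

    // With the first ldp moved to the end of the sequence, the load-store
    // optimizer can merge the final ldp and the add into a post-index ldp:
    ldp x24, x23, [sp, #16]
    ldp x22, x21, [sp, #32]
    ldp x20, x19, [sp, #48]
    ldp x26, x25, [sp], #64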

File tree (2 files changed: +78, -30 lines)

  llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
  llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 45 additions & 30 deletions
@@ -514,6 +514,38 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
   OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8);
 }
 
+static void adaptForLdStOpt(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator FirstSPPopI,
+                            MachineBasicBlock::iterator LastPopI) {
+  // Sometimes (when we restore in the same order as we save), we can end up
+  // with code like this:
+  //
+  // ldp      x26, x25, [sp]
+  // ldp      x24, x23, [sp, #16]
+  // ldp      x22, x21, [sp, #32]
+  // ldp      x20, x19, [sp, #48]
+  // add      sp, sp, #64
+  //
+  // In this case, it is always better to put the first ldp at the end, so
+  // that the load-store optimizer can run and merge the ldp and the add into
+  // a post-index ldp.
+  // If we managed to grab the first pop instruction, move it to the end.
+  if (ReverseCSRRestoreSeq)
+    MBB.splice(FirstSPPopI, &MBB, LastPopI);
+  // We should end up with something like this now:
+  //
+  // ldp      x24, x23, [sp, #16]
+  // ldp      x22, x21, [sp, #32]
+  // ldp      x20, x19, [sp, #48]
+  // ldp      x26, x25, [sp]
+  // add      sp, sp, #64
+  //
+  // and the load-store optimizer can merge the last two instructions into:
+  //
+  // ldp      x26, x25, [sp], #64
+  //
+}
+
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -930,12 +962,20 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     int StackRestoreBytes = RedZone ? 0 : NumBytes;
     if (NoCalleeSaveRestore)
       StackRestoreBytes += AfterCSRPopSize;
-    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
-                    StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+
     // If we were able to combine the local stack pop with the argument pop,
     // then we're done.
-    if (NoCalleeSaveRestore || AfterCSRPopSize == 0)
+    bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
+
+    // If we're done after this, make sure to help the load store optimizer.
+    if (Done)
+      adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
+
+    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+                    StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+    if (Done)
       return;
+
     NumBytes = 0;
   }
 
@@ -967,33 +1007,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     FirstSPPopI = Prev;
   }
 
-  // Sometimes (when we restore in the same order as we save), we can end up
-  // with code like this:
-  //
-  // ldp      x26, x25, [sp]
-  // ldp      x24, x23, [sp, #16]
-  // ldp      x22, x21, [sp, #32]
-  // ldp      x20, x19, [sp, #48]
-  // add      sp, sp, #64
-  //
-  // In this case, it is always better to put the first ldp at the end, so
-  // that the load-store optimizer can run and merge the ldp and the add into
-  // a post-index ldp.
-  // If we managed to grab the first pop instruction, move it to the end.
-  if (LastPopI != Begin)
-    MBB.splice(FirstSPPopI, &MBB, LastPopI);
-  // We should end up with something like this now:
-  //
-  // ldp      x24, x23, [sp, #16]
-  // ldp      x22, x21, [sp, #32]
-  // ldp      x20, x19, [sp, #48]
-  // ldp      x26, x25, [sp]
-  // add      sp, sp, #64
-  //
-  // and the load-store optimizer can merge the last two instructions into:
-  //
-  // ldp      x26, x25, [sp], #64
-  //
+  adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
+
   emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
                   AfterCSRPopSize, TII, MachineInstr::FrameDestroy);
 }

llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir

Lines changed: 33 additions & 0 deletions
@@ -7,6 +7,8 @@
 
   define void @bar() nounwind { entry: unreachable }
 
+  define void @baz() nounwind { entry: unreachable }
+
 ...
 ---
 name: foo
@@ -71,3 +73,34 @@ body: |
     ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 80, 0
     RET_ReallyLR
 ...
+---
+# Check that the load from the offset 0 is moved at the end even when hasFP is
+# false.
+name: baz
+# CHECK-LABEL: name: baz
+alignment: 2
+tracksRegLiveness: true
+frameInfo:
+  adjustsStack: true
+  hasCalls: true
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $x0 = IMPLICIT_DEF
+    $x20 = IMPLICIT_DEF
+    $x21 = IMPLICIT_DEF
+
+    ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
+    BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $x0
+    ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
+    B %bb.1
+
+  bb.1:
+  ; CHECK: $x20, $lr = frame-destroy LDPXi $sp, 2
+  ; BEFORELDSTOPT-NEXT: $x21 = frame-destroy LDRXui $sp, 0
+  ; BEFORELDSTOPT-NEXT: $sp = frame-destroy ADDXri $sp, 32, 0
+
+  ; AFTERLDSTOPT-NEXT: early-clobber $sp, $x21 = frame-destroy LDRXpost $sp, 32
+    RET_ReallyLR
+...
