@@ -514,6 +514,38 @@ static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
   OffsetOpnd.setImm(OffsetOpnd.getImm() + LocalStackSize / 8);
 }
 
+static void adaptForLdStOpt(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator FirstSPPopI,
+                            MachineBasicBlock::iterator LastPopI) {
+  // Sometimes (when we restore in the same order as we save), we can end up
+  // with code like this:
+  //
+  // ldp x26, x25, [sp]
+  // ldp x24, x23, [sp, #16]
+  // ldp x22, x21, [sp, #32]
+  // ldp x20, x19, [sp, #48]
+  // add sp, sp, #64
+  //
+  // In this case, it is always better to put the first ldp at the end, so
+  // that the load-store optimizer can run and merge the ldp and the add into
+  // a post-index ldp.
+  // If we managed to grab the first pop instruction, move it to the end.
+  if (ReverseCSRRestoreSeq)
+    MBB.splice(FirstSPPopI, &MBB, LastPopI);
+  // We should end up with something like this now:
+  //
+  // ldp x24, x23, [sp, #16]
+  // ldp x22, x21, [sp, #32]
+  // ldp x20, x19, [sp, #48]
+  // ldp x26, x25, [sp]
+  // add sp, sp, #64
+  //
+  // and the load-store optimizer can merge the last two instructions into:
+  //
+  // ldp x26, x25, [sp], #64
+  //
+}
+
 void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
                                         MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -930,12 +962,20 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     int StackRestoreBytes = RedZone ? 0 : NumBytes;
     if (NoCalleeSaveRestore)
       StackRestoreBytes += AfterCSRPopSize;
-    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
-                    StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+
     // If we were able to combine the local stack pop with the argument pop,
     // then we're done.
-    if (NoCalleeSaveRestore || AfterCSRPopSize == 0)
+    bool Done = NoCalleeSaveRestore || AfterCSRPopSize == 0;
+
+    // If we're done after this, make sure to help the load store optimizer.
+    if (Done)
+      adaptForLdStOpt(MBB, MBB.getFirstTerminator(), LastPopI);
+
+    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP,
+                    StackRestoreBytes, TII, MachineInstr::FrameDestroy);
+    if (Done)
       return;
+
     NumBytes = 0;
   }
 
@@ -967,33 +1007,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
       FirstSPPopI = Prev;
     }
 
-    // Sometimes (when we restore in the same order as we save), we can end up
-    // with code like this:
-    //
-    // ldp x26, x25, [sp]
-    // ldp x24, x23, [sp, #16]
-    // ldp x22, x21, [sp, #32]
-    // ldp x20, x19, [sp, #48]
-    // add sp, sp, #64
-    //
-    // In this case, it is always better to put the first ldp at the end, so
-    // that the load-store optimizer can run and merge the ldp and the add into
-    // a post-index ldp.
-    // If we managed to grab the first pop instruction, move it to the end.
-    if (LastPopI != Begin)
-      MBB.splice(FirstSPPopI, &MBB, LastPopI);
-    // We should end up with something like this now:
-    //
-    // ldp x24, x23, [sp, #16]
-    // ldp x22, x21, [sp, #32]
-    // ldp x20, x19, [sp, #48]
-    // ldp x26, x25, [sp]
-    // add sp, sp, #64
-    //
-    // and the load-store optimizer can merge the last two instructions into:
-    //
-    // ldp x26, x25, [sp], #64
-    //
+    adaptForLdStOpt(MBB, FirstSPPopI, LastPopI);
+
     emitFrameOffset(MBB, FirstSPPopI, DL, AArch64::SP, AArch64::SP,
                     AfterCSRPopSize, TII, MachineInstr::FrameDestroy);
   }