Skip to content

Commit 0b83065

Browse files
committed
[llvm][X86] Fix merging of large sp updates
1 parent d9af03b commit 0b83065

File tree

4 files changed

+156
-51
lines changed

4 files changed

+156
-51
lines changed

llvm/lib/Target/X86/X86ExpandPseudo.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
284284
// Adjust stack pointer.
285285
int StackAdj = StackAdjust.getImm();
286286
int MaxTCDelta = X86FI->getTCReturnAddrDelta();
287-
int Offset = 0;
287+
int64_t Offset = 0;
288288
assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");
289289

290290
// Incoporate the retaddr area.
@@ -297,7 +297,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
297297

298298
if (Offset) {
299299
// Check for possible merge with preceding ADD instruction.
300-
Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
300+
Offset = X86FL->mergeSPAdd(MBB, MBBI, Offset, true);
301301
X86FL->emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue=*/true);
302302
}
303303

llvm/lib/Target/X86/X86FrameLowering.cpp

Lines changed: 78 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -223,6 +223,8 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
223223
return false;
224224
}
225225

226+
constexpr int64_t MaxSPChunk = (1LL << 31) - 1;
227+
226228
/// emitSPUpdate - Emit a series of instructions to increment / decrement the
227229
/// stack pointer by a constant value.
228230
void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
@@ -242,7 +244,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
242244
return;
243245
}
244246

245-
uint64_t Chunk = (1LL << 31) - 1;
247+
uint64_t Chunk = MaxSPChunk;
246248

247249
MachineFunction &MF = *MBB.getParent();
248250
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
@@ -391,12 +393,15 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
391393
return MI;
392394
}
393395

394-
int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
395-
MachineBasicBlock::iterator &MBBI,
396-
bool doMergeWithPrevious) const {
396+
template <typename FoundT, typename CalcT>
397+
int64_t X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
398+
MachineBasicBlock::iterator &MBBI,
399+
FoundT FoundStackAdjust,
400+
CalcT CalcNewOffset,
401+
bool doMergeWithPrevious) const {
397402
if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
398403
(!doMergeWithPrevious && MBBI == MBB.end()))
399-
return 0;
404+
return CalcNewOffset(0);
400405

401406
MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
402407

@@ -415,27 +420,38 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
415420
if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction())
416421
PI = std::prev(PI);
417422

418-
unsigned Opc = PI->getOpcode();
419-
int Offset = 0;
420-
421-
if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&
422-
PI->getOperand(0).getReg() == StackPtr) {
423-
assert(PI->getOperand(1).getReg() == StackPtr);
424-
Offset = PI->getOperand(2).getImm();
425-
} else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
426-
PI->getOperand(0).getReg() == StackPtr &&
427-
PI->getOperand(1).getReg() == StackPtr &&
428-
PI->getOperand(2).getImm() == 1 &&
429-
PI->getOperand(3).getReg() == X86::NoRegister &&
430-
PI->getOperand(5).getReg() == X86::NoRegister) {
431-
// For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
432-
Offset = PI->getOperand(4).getImm();
433-
} else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) &&
434-
PI->getOperand(0).getReg() == StackPtr) {
435-
assert(PI->getOperand(1).getReg() == StackPtr);
436-
Offset = -PI->getOperand(2).getImm();
437-
} else
438-
return 0;
423+
int64_t Offset = 0;
424+
for (;;) {
425+
unsigned Opc = PI->getOpcode();
426+
427+
if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&
428+
PI->getOperand(0).getReg() == StackPtr) {
429+
assert(PI->getOperand(1).getReg() == StackPtr);
430+
Offset = PI->getOperand(2).getImm();
431+
} else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
432+
PI->getOperand(0).getReg() == StackPtr &&
433+
PI->getOperand(1).getReg() == StackPtr &&
434+
PI->getOperand(2).getImm() == 1 &&
435+
PI->getOperand(3).getReg() == X86::NoRegister &&
436+
PI->getOperand(5).getReg() == X86::NoRegister) {
437+
// For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
438+
Offset = PI->getOperand(4).getImm();
439+
} else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) &&
440+
PI->getOperand(0).getReg() == StackPtr) {
441+
assert(PI->getOperand(1).getReg() == StackPtr);
442+
Offset = -PI->getOperand(2).getImm();
443+
} else
444+
return CalcNewOffset(0);
445+
446+
FoundStackAdjust(PI, Offset);
447+
if (std::abs((int64_t)CalcNewOffset(Offset)) < MaxSPChunk)
448+
break;
449+
450+
if (doMergeWithPrevious ? (PI == MBB.begin()) : (PI == MBB.end()))
451+
return CalcNewOffset(0);
452+
453+
PI = doMergeWithPrevious ? std::prev(PI) : std::next(PI);
454+
}
439455

440456
PI = MBB.erase(PI);
441457
if (PI != MBB.end() && PI->isCFIInstruction()) {
@@ -448,7 +464,16 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
448464
if (!doMergeWithPrevious)
449465
MBBI = skipDebugInstructionsForward(PI, MBB.end());
450466

451-
return Offset;
467+
return CalcNewOffset(Offset);
468+
}
469+
470+
int64_t X86FrameLowering::mergeSPAdd(MachineBasicBlock &MBB,
471+
MachineBasicBlock::iterator &MBBI,
472+
int64_t AddOffset,
473+
bool doMergeWithPrevious) const {
474+
return mergeSPUpdates(
475+
MBB, MBBI, [AddOffset](int64_t Offset) { return AddOffset + Offset; },
476+
doMergeWithPrevious);
452477
}
453478

454479
void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
@@ -1975,8 +2000,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
19752000

19762001
// If there is an SUB32ri of ESP immediately before this instruction, merge
19772002
// the two. This can be the case when tail call elimination is enabled and
1978-
// the callee has more arguments then the caller.
1979-
NumBytes -= mergeSPUpdates(MBB, MBBI, true);
2003+
// the callee has more arguments than the caller.
2004+
NumBytes = mergeSPUpdates(
2005+
MBB, MBBI, [NumBytes](int64_t Offset) { return NumBytes - Offset; },
2006+
true);
19802007

19812008
// Adjust stack pointer: ESP -= numbytes.
19822009

@@ -2457,7 +2484,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
24572484
if (HasFP) {
24582485
if (X86FI->hasSwiftAsyncContext()) {
24592486
// Discard the context.
2460-
int Offset = 16 + mergeSPUpdates(MBB, MBBI, true);
2487+
int64_t Offset = mergeSPAdd(MBB, MBBI, 16, true);
24612488
emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
24622489
}
24632490
// Pop EBP.
@@ -2531,7 +2558,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
25312558
// If there is an ADD32ri or SUB32ri of ESP immediately before this
25322559
// instruction, merge the two instructions.
25332560
if (NumBytes || MFI.hasVarSizedObjects())
2534-
NumBytes += mergeSPUpdates(MBB, MBBI, true);
2561+
NumBytes = mergeSPAdd(MBB, MBBI, NumBytes, true);
25352562

25362563
// If dynamic alloca is used, then reset esp to point to the last callee-saved
25372564
// slot before popping them off! Same applies for the case, when stack was
@@ -2612,11 +2639,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
26122639

26132640
if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
26142641
// Add the return addr area delta back since we are not tail calling.
2615-
int Offset = -1 * X86FI->getTCReturnAddrDelta();
2642+
int64_t Offset = -1 * X86FI->getTCReturnAddrDelta();
26162643
assert(Offset >= 0 && "TCDelta should never be positive");
26172644
if (Offset) {
26182645
// Check for possible merge with preceding ADD instruction.
2619-
Offset += mergeSPUpdates(MBB, Terminator, true);
2646+
Offset = mergeSPAdd(MBB, Terminator, Offset, true);
26202647
emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
26212648
}
26222649
}
@@ -3814,13 +3841,24 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
38143841

38153842
// Add Amount to SP to destroy a frame, or subtract to setup.
38163843
int64_t StackAdjustment = isDestroy ? Amount : -Amount;
3844+
int64_t CfaAdjustment = StackAdjustment;
38173845

38183846
if (StackAdjustment) {
38193847
// Merge with any previous or following adjustment instruction. Note: the
38203848
// instructions merged with here do not have CFI, so their stack
3821-
// adjustments do not feed into CfaAdjustment.
3822-
StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
3823-
StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);
3849+
// adjustments do not feed into CfaAdjustment
3850+
3851+
auto CalcCfaAdjust = [&CfaAdjustment](MachineBasicBlock::iterator PI,
3852+
int64_t Offset) {
3853+
CfaAdjustment += Offset;
3854+
};
3855+
auto CalcNewOffset = [&StackAdjustment](int64_t Offset) {
3856+
return StackAdjustment + Offset;
3857+
};
3858+
StackAdjustment =
3859+
mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, true);
3860+
StackAdjustment =
3861+
mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, false);
38243862

38253863
if (StackAdjustment) {
38263864
if (!(F.hasMinSize() &&
@@ -3830,22 +3868,19 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
38303868
}
38313869
}
38323870

3833-
if (DwarfCFI && !hasFP(MF)) {
3871+
if (DwarfCFI && !hasFP(MF) && CfaAdjustment) {
38343872
// If we don't have FP, but need to generate unwind information,
38353873
// we need to set the correct CFA offset after the stack adjustment.
38363874
// How much we adjust the CFA offset depends on whether we're emitting
38373875
// CFI only for EH purposes or for debugging. EH only requires the CFA
38383876
// offset to be correct at each call site, while for debugging we want
38393877
// it to be more precise.
38403878

3841-
int64_t CfaAdjustment = -StackAdjustment;
38423879
// TODO: When not using precise CFA, we also need to adjust for the
38433880
// InternalAmt here.
3844-
if (CfaAdjustment) {
3845-
BuildCFI(
3846-
MBB, InsertPos, DL,
3847-
MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment));
3848-
}
3881+
BuildCFI(
3882+
MBB, InsertPos, DL,
3883+
MCCFIInstruction::createAdjustCfaOffset(nullptr, -CfaAdjustment));
38493884
}
38503885

38513886
return I;

llvm/lib/Target/X86/X86FrameLowering.h

Lines changed: 44 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -134,12 +134,50 @@ class X86FrameLowering : public TargetFrameLowering {
134134
processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
135135
RegScavenger *RS) const override;
136136

137-
/// Check the instruction before/after the passed instruction. If
138-
/// it is an ADD/SUB/LEA instruction it is deleted argument and the
139-
/// stack adjustment is returned as a positive value for ADD/LEA and
140-
/// a negative for SUB.
141-
int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
142-
bool doMergeWithPrevious) const;
137+
private:
138+
/// Basic Pseudocode:
139+
/// if (instruction before/after the passed instruction is ADD/SUB/LEA)
140+
/// Offset = instruction stack adjustment
141+
/// ... positive value for ADD/LEA and negative for SUB
142+
/// FoundStackAdjust(instruction, Offset)
143+
/// erase(instruction)
144+
/// return CalcNewOffset(Offset)
145+
/// else
146+
/// return CalcNewOffset(0)
147+
///
148+
/// It's possible that the selected instruction is not immediately
149+
/// before/after MBBI for large adjustments that have been split into multiple
150+
/// instructions.
151+
///
152+
/// FoundStackAdjust should have the signature:
153+
/// void FoundStackAdjust(MachineBasicBlock::iterator PI, int64_t Offset)
154+
/// CalcNewOffset should have the signature:
155+
/// int64_t CalcNewOffset(int64_t Offset)
156+
template <typename FoundT, typename CalcT>
157+
int64_t mergeSPUpdates(MachineBasicBlock &MBB,
158+
MachineBasicBlock::iterator &MBBI,
159+
FoundT FoundStackAdjust, CalcT CalcNewOffset,
160+
bool doMergeWithPrevious) const;
161+
162+
template <typename CalcT>
163+
int64_t mergeSPUpdates(MachineBasicBlock &MBB,
164+
MachineBasicBlock::iterator &MBBI, CalcT CalcNewOffset,
165+
bool doMergeWithPrevious) const {
166+
auto FoundStackAdjust = [](MachineBasicBlock::iterator MBBI,
167+
int64_t Offset) {};
168+
return mergeSPUpdates(MBB, MBBI, FoundStackAdjust, CalcNewOffset,
169+
doMergeWithPrevious);
170+
}
171+
172+
public:
173+
/// Equivalent to:
174+
/// mergeSPUpdates(MBB, MBBI,
175+
/// [AddOffset](int64_t Offset) {
176+
/// return AddOffset + Offset;
177+
/// },
178+
/// doMergeWithPrevious);
179+
int64_t mergeSPAdd(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
180+
int64_t AddOffset, bool doMergeWithPrevious) const;
143181

144182
/// Emit a series of instructions to increment / decrement the stack
145183
/// pointer by a constant value.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
; RUN: llc < %s -mtriple=x86_64-linux-unknown -verify-machineinstrs -o %t.s
2+
; RUN: FileCheck --input-file=%t.s %s
3+
4+
; Double-check that we are able to assemble the generated '.s'. A symptom of the
5+
; problem that led to this test is an assembler failure when using
6+
; '-save-temps'. For example:
7+
;
8+
; > ...s:683:7: error: invalid operand for instruction
9+
; > addq $2147483679, %rsp # imm = 0x8000001F
10+
;
11+
; RUN: llvm-mc -triple x86_64-unknown-unknown %t.s
12+
13+
; Check that the stack update after calling bar gets merged into the second add
14+
; and not the first which is already at the chunk size limit (0x7FFFFFFF).
15+
16+
define void @foo(ptr %rhs) {
17+
; CHECK-LABEL: foo
18+
entry:
19+
%lhs = alloca [5 x [5 x [3 x [162 x [161 x [161 x double]]]]]], align 16
20+
store ptr %lhs, ptr %rhs, align 8
21+
%0 = call i32 @baz()
22+
call void @bar(i64 0, i64 0, i64 0, i64 0, i64 0, ptr null, ptr %rhs, ptr null, ptr %rhs)
23+
; CHECK: call{{.*}}bar
24+
; CHECK: addq{{.*}}$2147483647, %rsp
25+
; CHECK: addq{{.*}}$372037585, %rsp
26+
; CHECK: .cfi_adjust_cfa_offset -2519521232
27+
ret void
28+
}
29+
30+
declare void @bar(i64, i64, i64, i64, i64, ptr, ptr, ptr, ptr)
31+
32+
declare i32 @baz()

0 commit comments

Comments
 (0)