Skip to content

[llvm][X86] Fix merging of large sp updates #125007

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions llvm/lib/Target/X86/X86ExpandPseudo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,
// Adjust stack pointer.
int StackAdj = StackAdjust.getImm();
int MaxTCDelta = X86FI->getTCReturnAddrDelta();
int Offset = 0;
int64_t Offset = 0;
assert(MaxTCDelta <= 0 && "MaxTCDelta should never be positive");

// Incoporate the retaddr area.
Expand All @@ -297,7 +297,7 @@ bool X86ExpandPseudo::expandMI(MachineBasicBlock &MBB,

if (Offset) {
// Check for possible merge with preceding ADD instruction.
Offset += X86FL->mergeSPUpdates(MBB, MBBI, true);
Offset = X86FL->mergeSPAdd(MBB, MBBI, Offset, true);
X86FL->emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue=*/true);
}

Expand Down
121 changes: 78 additions & 43 deletions llvm/lib/Target/X86/X86FrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,8 @@ flagsNeedToBePreservedBeforeTheTerminators(const MachineBasicBlock &MBB) {
return false;
}

constexpr int64_t MaxSPChunk = (1LL << 31) - 1;

/// emitSPUpdate - Emit a series of instructions to increment / decrement the
/// stack pointer by a constant value.
void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
Expand All @@ -242,7 +244,7 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
return;
}

uint64_t Chunk = (1LL << 31) - 1;
uint64_t Chunk = MaxSPChunk;

MachineFunction &MF = *MBB.getParent();
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
Expand Down Expand Up @@ -391,12 +393,15 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
return MI;
}

int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
bool doMergeWithPrevious) const {
template <typename FoundT, typename CalcT>
int64_t X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
FoundT FoundStackAdjust,
CalcT CalcNewOffset,
bool doMergeWithPrevious) const {
if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
(!doMergeWithPrevious && MBBI == MBB.end()))
return 0;
return CalcNewOffset(0);

MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;

Expand All @@ -415,27 +420,38 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
if (doMergeWithPrevious && PI != MBB.begin() && PI->isCFIInstruction())
PI = std::prev(PI);

unsigned Opc = PI->getOpcode();
int Offset = 0;

if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&
PI->getOperand(0).getReg() == StackPtr) {
assert(PI->getOperand(1).getReg() == StackPtr);
Offset = PI->getOperand(2).getImm();
} else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
PI->getOperand(0).getReg() == StackPtr &&
PI->getOperand(1).getReg() == StackPtr &&
PI->getOperand(2).getImm() == 1 &&
PI->getOperand(3).getReg() == X86::NoRegister &&
PI->getOperand(5).getReg() == X86::NoRegister) {
// For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
Offset = PI->getOperand(4).getImm();
} else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) &&
PI->getOperand(0).getReg() == StackPtr) {
assert(PI->getOperand(1).getReg() == StackPtr);
Offset = -PI->getOperand(2).getImm();
} else
return 0;
int64_t Offset = 0;
for (;;) {
unsigned Opc = PI->getOpcode();

if ((Opc == X86::ADD64ri32 || Opc == X86::ADD32ri) &&
PI->getOperand(0).getReg() == StackPtr) {
assert(PI->getOperand(1).getReg() == StackPtr);
Offset = PI->getOperand(2).getImm();
} else if ((Opc == X86::LEA32r || Opc == X86::LEA64_32r) &&
PI->getOperand(0).getReg() == StackPtr &&
PI->getOperand(1).getReg() == StackPtr &&
PI->getOperand(2).getImm() == 1 &&
PI->getOperand(3).getReg() == X86::NoRegister &&
PI->getOperand(5).getReg() == X86::NoRegister) {
// For LEAs we have: def = lea SP, FI, noreg, Offset, noreg.
Offset = PI->getOperand(4).getImm();
} else if ((Opc == X86::SUB64ri32 || Opc == X86::SUB32ri) &&
PI->getOperand(0).getReg() == StackPtr) {
assert(PI->getOperand(1).getReg() == StackPtr);
Offset = -PI->getOperand(2).getImm();
} else
return CalcNewOffset(0);

FoundStackAdjust(PI, Offset);
if (std::abs((int64_t)CalcNewOffset(Offset)) < MaxSPChunk)
break;

if (doMergeWithPrevious ? (PI == MBB.begin()) : (PI == MBB.end()))
return CalcNewOffset(0);

PI = doMergeWithPrevious ? std::prev(PI) : std::next(PI);
}

PI = MBB.erase(PI);
if (PI != MBB.end() && PI->isCFIInstruction()) {
Expand All @@ -448,7 +464,16 @@ int X86FrameLowering::mergeSPUpdates(MachineBasicBlock &MBB,
if (!doMergeWithPrevious)
MBBI = skipDebugInstructionsForward(PI, MBB.end());

return Offset;
return CalcNewOffset(Offset);
}

int64_t X86FrameLowering::mergeSPAdd(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
int64_t AddOffset,
bool doMergeWithPrevious) const {
return mergeSPUpdates(
MBB, MBBI, [AddOffset](int64_t Offset) { return AddOffset + Offset; },
doMergeWithPrevious);
}

void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
Expand Down Expand Up @@ -1975,8 +2000,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,

// If there is an SUB32ri of ESP immediately before this instruction, merge
// the two. This can be the case when tail call elimination is enabled and
// the callee has more arguments then the caller.
NumBytes -= mergeSPUpdates(MBB, MBBI, true);
// the callee has more arguments than the caller.
NumBytes = mergeSPUpdates(
MBB, MBBI, [NumBytes](int64_t Offset) { return NumBytes - Offset; },
true);

// Adjust stack pointer: ESP -= numbytes.

Expand Down Expand Up @@ -2457,7 +2484,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (HasFP) {
if (X86FI->hasSwiftAsyncContext()) {
// Discard the context.
int Offset = 16 + mergeSPUpdates(MBB, MBBI, true);
int64_t Offset = mergeSPAdd(MBB, MBBI, 16, true);
emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
}
// Pop EBP.
Expand Down Expand Up @@ -2531,7 +2558,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// If there is an ADD32ri or SUB32ri of ESP immediately before this
// instruction, merge the two instructions.
if (NumBytes || MFI.hasVarSizedObjects())
NumBytes += mergeSPUpdates(MBB, MBBI, true);
NumBytes = mergeSPAdd(MBB, MBBI, NumBytes, true);

// If dynamic alloca is used, then reset esp to point to the last callee-saved
// slot before popping them off! Same applies for the case, when stack was
Expand Down Expand Up @@ -2612,11 +2639,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,

if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
// Add the return addr area delta back since we are not tail calling.
int Offset = -1 * X86FI->getTCReturnAddrDelta();
int64_t Offset = -1 * X86FI->getTCReturnAddrDelta();
assert(Offset >= 0 && "TCDelta should never be positive");
if (Offset) {
// Check for possible merge with preceding ADD instruction.
Offset += mergeSPUpdates(MBB, Terminator, true);
Offset = mergeSPAdd(MBB, Terminator, Offset, true);
emitSPUpdate(MBB, Terminator, DL, Offset, /*InEpilogue=*/true);
}
}
Expand Down Expand Up @@ -3814,13 +3841,24 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(

// Add Amount to SP to destroy a frame, or subtract to setup.
int64_t StackAdjustment = isDestroy ? Amount : -Amount;
int64_t CfaAdjustment = StackAdjustment;

if (StackAdjustment) {
// Merge with any previous or following adjustment instruction. Note: the
// instructions merged with here do not have CFI, so their stack
// adjustments do not feed into CfaAdjustment.
StackAdjustment += mergeSPUpdates(MBB, InsertPos, true);
StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);
// adjustments do not feed into CfaAdjustment

auto CalcCfaAdjust = [&CfaAdjustment](MachineBasicBlock::iterator PI,
int64_t Offset) {
CfaAdjustment += Offset;
};
auto CalcNewOffset = [&StackAdjustment](int64_t Offset) {
return StackAdjustment + Offset;
};
StackAdjustment =
mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, true);
StackAdjustment =
mergeSPUpdates(MBB, InsertPos, CalcCfaAdjust, CalcNewOffset, false);

if (StackAdjustment) {
if (!(F.hasMinSize() &&
Expand All @@ -3830,22 +3868,19 @@ MachineBasicBlock::iterator X86FrameLowering::eliminateCallFramePseudoInstr(
}
}

if (DwarfCFI && !hasFP(MF)) {
if (DwarfCFI && !hasFP(MF) && CfaAdjustment) {
// If we don't have FP, but need to generate unwind information,
// we need to set the correct CFA offset after the stack adjustment.
// How much we adjust the CFA offset depends on whether we're emitting
// CFI only for EH purposes or for debugging. EH only requires the CFA
// offset to be correct at each call site, while for debugging we want
// it to be more precise.

int64_t CfaAdjustment = -StackAdjustment;
// TODO: When not using precise CFA, we also need to adjust for the
// InternalAmt here.
if (CfaAdjustment) {
BuildCFI(
MBB, InsertPos, DL,
MCCFIInstruction::createAdjustCfaOffset(nullptr, CfaAdjustment));
}
BuildCFI(
MBB, InsertPos, DL,
MCCFIInstruction::createAdjustCfaOffset(nullptr, -CfaAdjustment));
}

return I;
Expand Down
50 changes: 44 additions & 6 deletions llvm/lib/Target/X86/X86FrameLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -134,12 +134,50 @@ class X86FrameLowering : public TargetFrameLowering {
processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
RegScavenger *RS) const override;

/// Check the instruction before/after the passed instruction. If
/// it is an ADD/SUB/LEA instruction it is deleted argument and the
/// stack adjustment is returned as a positive value for ADD/LEA and
/// a negative for SUB.
int mergeSPUpdates(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
bool doMergeWithPrevious) const;
private:
/// Basic Pseudocode:
/// if (instruction before/after the passed instruction is ADD/SUB/LEA)
/// Offset = instruction stack adjustment
/// ... positive value for ADD/LEA and negative for SUB
/// FoundStackAdjust(instruction, Offset)
/// erase(instruction)
/// return CalcNewOffset(Offset)
/// else
/// return CalcNewOffset(0)
///
/// It's possible that the selected instruction is not immediately
/// before/after MBBI for large adjustments that have been split into multiple
/// instructions.
///
/// FoundStackAdjust should have the signature:
/// void FoundStackAdjust(MachineBasicBlock::iterator PI, int64_t Offset)
/// CalcNewOffset should have the signature:
/// int64_t CalcNewOffset(int64_t Offset)
template <typename FoundT, typename CalcT>
int64_t mergeSPUpdates(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
FoundT FoundStackAdjust, CalcT CalcNewOffset,
bool doMergeWithPrevious) const;

template <typename CalcT>
int64_t mergeSPUpdates(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI, CalcT CalcNewOffset,
bool doMergeWithPrevious) const {
auto FoundStackAdjust = [](MachineBasicBlock::iterator MBBI,
int64_t Offset) {};
return mergeSPUpdates(MBB, MBBI, FoundStackAdjust, CalcNewOffset,
doMergeWithPrevious);
}

public:
/// Equivalent to:
/// mergeSPUpdates(MBB, MBBI,
/// [AddOffset](int64_t Offset) {
/// return AddOffset + Offset;
/// },
/// doMergeWithPrevious);
int64_t mergeSPAdd(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
int64_t AddOffset, bool doMergeWithPrevious) const;

/// Emit a series of instructions to increment / decrement the stack
/// pointer by a constant value.
Expand Down
32 changes: 32 additions & 0 deletions llvm/test/CodeGen/X86/merge-huge-sp-updates.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
; RUN: llc < %s -mtriple=x86_64-linux-unknown -verify-machineinstrs -o %t.s
; RUN: FileCheck --input-file=%t.s %s

; Double-check that we are able to assemble the generated '.s'. A symptom of the
; problem that led to this test is an assembler failure when using
; '-save-temps'. For example:
;
; > ...s:683:7: error: invalid operand for instruction
; > addq $2147483679, %rsp # imm = 0x8000001F
;
; RUN: llvm-mc -triple x86_64-unknown-unknown %t.s

; Check that the stack update after calling bar gets merged into the second add
; and not the first which is already at the chunk size limit (0x7FFFFFFF).

define void @foo(ptr %rhs) {
; CHECK-LABEL: foo
entry:
%lhs = alloca [5 x [5 x [3 x [162 x [161 x [161 x double]]]]]], align 16
store ptr %lhs, ptr %rhs, align 8
%0 = call i32 @baz()
call void @bar(i64 0, i64 0, i64 0, i64 0, i64 0, ptr null, ptr %rhs, ptr null, ptr %rhs)
; CHECK: call{{.*}}bar
; CHECK: addq{{.*}}$2147483647, %rsp
; CHECK: addq{{.*}}$372037585, %rsp
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I compared the assembly before and after this patch, but the CFI directive doesn't look correct to me after this patch.

Before:

        addq    $2147483679, %rsp               # imm = 0x8000001F
        .cfi_adjust_cfa_offset -2147483679
        addq    $372037553, %rsp                # imm = 0x162CD7B1
        .cfi_def_cfa_offset 16
        popq    %rbx
        .cfi_def_cfa_offset 8
        retq

After:

        addq    $2147483647, %rsp               # imm = 0x7FFFFFFF
        addq    $372037585, %rsp                # imm = 0x162CD7D1
        .cfi_adjust_cfa_offset -372037585
        popq    %rbx
        .cfi_def_cfa_offset 8
        retq

Can you take a look please?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Latest version updates .cfi_adjust_cfa_offset to account for total stack adjustment:

        addq    $2147483647, %rsp               # imm = 0x7FFFFFFF
        addq    $372037585, %rsp                # imm = 0x162CD7D1
        .cfi_adjust_cfa_offset -2519521232
        popq    %rbx
        .cfi_def_cfa_offset 8
        retq

My understanding is limited, but the .cfi_adjust_cfa_offset before my change seems incorrect in that it does not account for the following addq.

; CHECK: .cfi_adjust_cfa_offset -2519521232
ret void
}

declare void @bar(i64, i64, i64, i64, i64, ptr, ptr, ptr, ptr)

declare i32 @baz()