Skip to content

[AArch64] Prevent the AArch64LoadStoreOptimizer from reordering CFI instructions #101317

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Sep 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
109 changes: 79 additions & 30 deletions llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,8 +190,12 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
// pre or post indexed addressing with writeback. Scan backwards.
// `MergeEither` is set to true if the combined instruction may be placed
// either at the location of the load/store instruction or at the location of
// the update intruction.
MachineBasicBlock::iterator
findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit,
bool &MergeEither);

// Find an instruction that updates the base register of the ld/st
// instruction.
Expand All @@ -202,9 +206,10 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
unsigned IndexReg, unsigned &Offset);

// Merge a pre- or post-index base register update into a ld/st instruction.
MachineBasicBlock::iterator
std::optional<MachineBasicBlock::iterator>
mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update, bool IsPreIdx);
MachineBasicBlock::iterator Update, bool IsForward,
bool IsPreIdx, bool MergeEither);

MachineBasicBlock::iterator
mergeConstOffsetInsn(MachineBasicBlock::iterator I,
Expand Down Expand Up @@ -2070,20 +2075,37 @@ maybeMoveCFI(MachineInstr &MI, MachineBasicBlock::iterator MaybeCFI) {
}
}

MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Update,
bool IsPreIdx) {
std::optional<MachineBasicBlock::iterator> AArch64LoadStoreOpt::mergeUpdateInsn(
MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update,
bool IsForward, bool IsPreIdx, bool MergeEither) {
assert((Update->getOpcode() == AArch64::ADDXri ||
Update->getOpcode() == AArch64::SUBXri) &&
"Unexpected base register update instruction to merge!");
MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock::iterator NextI = next_nodbg(I, E);

// If updating the SP and the following instruction is CFA offset related CFI
// instruction move it after the merged instruction.
MachineBasicBlock::iterator CFI =
IsPreIdx ? maybeMoveCFI(*Update, next_nodbg(Update, E)) : E;
// If updating the SP and the following instruction is CFA offset related CFI,
// make sure the CFI follows the SP update either by merging at the location
// of the update or by moving the CFI after the merged instruction. If unable
// to do so, bail.
MachineBasicBlock::iterator InsertPt = I;
if (IsForward) {
assert(IsPreIdx);
if (auto CFI = maybeMoveCFI(*Update, next_nodbg(Update, E)); CFI != E) {
if (MergeEither) {
InsertPt = Update;
} else {
// Take care not to reorder CFIs.
if (std::any_of(std::next(CFI), I, [](const auto &Insn) {
return Insn.getOpcode() == TargetOpcode::CFI_INSTRUCTION;
}))
return std::nullopt;

MachineBasicBlock *MBB = InsertPt->getParent();
MBB->splice(std::next(InsertPt), MBB, CFI);
}
}
}

// Return the instruction following the merged instruction, which is
// the instruction following our unmerged load. Unless that's the add/sub
Expand All @@ -2104,7 +2126,8 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
if (!AArch64InstrInfo::isPairedLdSt(*I)) {
// Non-paired instruction.
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
MIB = BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
TII->get(NewOpc))
.add(Update->getOperand(0))
.add(getLdStRegOp(*I))
.add(AArch64InstrInfo::getLdStBaseOp(*I))
Expand All @@ -2113,7 +2136,8 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
.setMIFlags(I->mergeFlagsWith(*Update));
} else {
// Paired instruction.
MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
MIB = BuildMI(*InsertPt->getParent(), InsertPt, InsertPt->getDebugLoc(),
TII->get(NewOpc))
.add(Update->getOperand(0))
.add(getLdStRegOp(*I, 0))
.add(getLdStRegOp(*I, 1))
Expand All @@ -2122,10 +2146,6 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
.setMemRefs(I->memoperands())
.setMIFlags(I->mergeFlagsWith(*Update));
}
if (CFI != E) {
MachineBasicBlock *MBB = I->getParent();
MBB->splice(std::next(MIB.getInstr()->getIterator()), MBB, CFI);
}

if (IsPreIdx) {
++NumPreFolded;
Expand Down Expand Up @@ -2360,7 +2380,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
}

MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
MachineBasicBlock::iterator I, unsigned Limit) {
MachineBasicBlock::iterator I, unsigned Limit, bool &MergeEither) {
MachineBasicBlock::iterator B = I->getParent()->begin();
MachineBasicBlock::iterator E = I->getParent()->end();
MachineInstr &MemMI = *I;
Expand All @@ -2370,19 +2390,21 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
Register BaseReg = AArch64InstrInfo::getLdStBaseOp(MemMI).getReg();
int Offset = AArch64InstrInfo::getLdStOffsetOp(MemMI).getImm();

bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
Register DestReg[] = {getLdStRegOp(MemMI, 0).getReg(),
IsPairedInsn ? getLdStRegOp(MemMI, 1).getReg()
: AArch64::NoRegister};

// If the load/store is the first instruction in the block, there's obviously
// not any matching update. Ditto if the memory offset isn't zero.
if (MBBI == B || Offset != 0)
return E;
// If the base register overlaps a destination register, we can't
// merge the update.
if (!isTagStore(MemMI)) {
bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
Register DestReg = getLdStRegOp(MemMI, i).getReg();
if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i)
if (DestReg[i] == BaseReg || TRI->isSubRegister(BaseReg, DestReg[i]))
return E;
}
}

const bool BaseRegSP = BaseReg == AArch64::SP;
Expand All @@ -2403,6 +2425,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
UsedRegUnits.clear();
unsigned Count = 0;
bool MemAcessBeforeSPPreInc = false;
MergeEither = true;
do {
MBBI = prev_nodbg(MBBI, B);
MachineInstr &MI = *MBBI;
Expand All @@ -2429,6 +2452,20 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
if (!ModifiedRegUnits.available(BaseReg) ||
!UsedRegUnits.available(BaseReg))
return E;

// If we have a destination register (i.e. a load instruction) and a
// destination register is used or modified, then we can only merge forward,
// i.e. the combined instruction is put in the place of the memory
// instruction. Same applies if we see a memory access or side effects.
if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects() ||
(DestReg[0] != AArch64::NoRegister &&
!(ModifiedRegUnits.available(DestReg[0]) &&
UsedRegUnits.available(DestReg[0]))) ||
(DestReg[1] != AArch64::NoRegister &&
!(ModifiedRegUnits.available(DestReg[1]) &&
UsedRegUnits.available(DestReg[1]))))
MergeEither = false;

// Keep track if we have a memory access before an SP pre-increment, in this
// case we need to validate later that the update amount respects the red
// zone.
Expand Down Expand Up @@ -2639,8 +2676,12 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
if (Update != E) {
// Merge the update into the ld/st.
MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
return true;
if (auto NextI = mergeUpdateInsn(MBBI, Update, /*IsForward=*/false,
/*IsPreIdx=*/false,
/*MergeEither=*/false)) {
MBBI = *NextI;
return true;
}
}

// Don't know how to handle unscaled pre/post-index versions below, so bail.
Expand All @@ -2652,11 +2693,15 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
// ldr x1, [x0]
// merged into:
// ldr x1, [x0, #8]!
Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
bool MergeEither;
Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit, MergeEither);
if (Update != E) {
// Merge the update into the ld/st.
MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
return true;
if (auto NextI = mergeUpdateInsn(MBBI, Update, /*IsForward=*/true,
/*IsPreIdx=*/true, MergeEither)) {
MBBI = *NextI;
return true;
}
}

// The immediate in the load/store is scaled by the size of the memory
Expand All @@ -2673,8 +2718,12 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit);
if (Update != E) {
// Merge the update into the ld/st.
MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
return true;
if (auto NextI = mergeUpdateInsn(MBBI, Update, /*IsForward=*/false,
/*IsPreIdx=*/true,
/*MergeEither=*/false)) {
MBBI = *NextI;
return true;
}
}

return false;
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/AArch64/build-one-lane.ll
Original file line number Diff line number Diff line change
Expand Up @@ -318,9 +318,9 @@ define void @v2f64st(ptr %p, double %s) nounwind {
define <32 x i8> @test_lanex_32xi8(<32 x i8> %a, i32 %x) {
; CHECK-LABEL: test_lanex_32xi8:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-NEXT: and x8, x0, #0x1f
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: mov w10, #30 // =0x1e
Expand Down
28 changes: 14 additions & 14 deletions llvm/test/CodeGen/AArch64/insertextract.ll
Original file line number Diff line number Diff line change
Expand Up @@ -160,9 +160,9 @@ entry:
define <4 x double> @insert_v4f64_c(<4 x double> %a, double %b, i32 %c) {
; CHECK-SD-LABEL: insert_v4f64_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0x3
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: str d2, [x9, x8, lsl #3]
Expand Down Expand Up @@ -387,9 +387,9 @@ entry:
define <8 x float> @insert_v8f32_c(<8 x float> %a, float %b, i32 %c) {
; CHECK-SD-LABEL: insert_v8f32_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0x7
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: str s2, [x9, x8, lsl #2]
Expand Down Expand Up @@ -552,9 +552,9 @@ entry:
define <16 x half> @insert_v16f16_c(<16 x half> %a, half %b, i32 %c) {
; CHECK-SD-LABEL: insert_v16f16_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xf
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: str h2, [x9, x8, lsl #1]
Expand Down Expand Up @@ -715,9 +715,9 @@ entry:
define <32 x i8> @insert_v32i8_c(<32 x i8> %a, i8 %b, i32 %c) {
; CHECK-SD-LABEL: insert_v32i8_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-SD-NEXT: and x8, x1, #0x1f
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: strb w0, [x9, x8]
Expand Down Expand Up @@ -876,9 +876,9 @@ entry:
define <16 x i16> @insert_v16i16_c(<16 x i16> %a, i16 %b, i32 %c) {
; CHECK-SD-LABEL: insert_v16i16_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-SD-NEXT: and x8, x1, #0xf
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: strh w0, [x9, x8, lsl #1]
Expand Down Expand Up @@ -1103,9 +1103,9 @@ entry:
define <8 x i32> @insert_v8i32_c(<8 x i32> %a, i32 %b, i32 %c) {
; CHECK-SD-LABEL: insert_v8i32_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-SD-NEXT: and x8, x1, #0x7
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: str w0, [x9, x8, lsl #2]
Expand Down Expand Up @@ -1288,9 +1288,9 @@ entry:
define <4 x i64> @insert_v4i64_c(<4 x i64> %a, i64 %b, i32 %c) {
; CHECK-SD-LABEL: insert_v4i64_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
; CHECK-SD-NEXT: and x8, x1, #0x3
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: str x0, [x9, x8, lsl #3]
Expand Down Expand Up @@ -1454,9 +1454,9 @@ entry:
define double @extract_v4f64_c(<4 x double> %a, i32 %c) {
; CHECK-SD-LABEL: extract_v4f64_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0x3
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: ldr d0, [x9, x8, lsl #3]
Expand Down Expand Up @@ -1662,9 +1662,9 @@ entry:
define float @extract_v8f32_c(<8 x float> %a, i32 %c) {
; CHECK-SD-LABEL: extract_v8f32_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0x7
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: ldr s0, [x9, x8, lsl #2]
Expand Down Expand Up @@ -1821,9 +1821,9 @@ entry:
define half @extract_v16f16_c(<16 x half> %a, i32 %c) {
; CHECK-SD-LABEL: extract_v16f16_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xf
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: ldr h0, [x9, x8, lsl #1]
Expand Down Expand Up @@ -1979,9 +1979,9 @@ entry:
define i8 @extract_v32i8_c(<32 x i8> %a, i32 %c) {
; CHECK-SD-LABEL: extract_v32i8_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0x1f
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: ldrb w0, [x9, x8]
Expand Down Expand Up @@ -2135,9 +2135,9 @@ entry:
define i16 @extract_v16i16_c(<16 x i16> %a, i32 %c) {
; CHECK-SD-LABEL: extract_v16i16_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xf
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: ldrh w0, [x9, x8, lsl #1]
Expand Down Expand Up @@ -2368,9 +2368,9 @@ entry:
define i32 @extract_v8i32_c(<8 x i32> %a, i32 %c) {
; CHECK-SD-LABEL: extract_v8i32_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0x7
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: ldr w0, [x9, x8, lsl #2]
Expand Down Expand Up @@ -2551,9 +2551,9 @@ entry:
define i64 @extract_v4i64_c(<4 x i64> %a, i32 %c) {
; CHECK-SD-LABEL: extract_v4i64_c:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: stp q0, q1, [sp, #-32]!
; CHECK-SD-NEXT: .cfi_def_cfa_offset 32
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0x3
; CHECK-SD-NEXT: mov x9, sp
; CHECK-SD-NEXT: ldr x0, [x9, x8, lsl #3]
Expand Down
Loading
Loading