Skip to content

[LoongArch] Merge base and offset for tls-le code sequence #122999

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Feb 10, 2025
165 changes: 149 additions & 16 deletions llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass {
bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12,
MachineInstr *&Lo20, MachineInstr *&Hi12,
MachineInstr *&Last);
bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Add,
MachineInstr *&Lo12);

bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12,
MachineInstr *&Lo20, MachineInstr *&Hi12,
Expand Down Expand Up @@ -176,7 +178,80 @@ bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
return true;
}

// Update the offset in Hi20, Lo12, Lo20 and Hi12 instructions.
// Recognize the tls-le address sequence emitted for the small/medium code
// models:
//
//   lu12i.w  vreg1, %le_hi20_r(s)
//   add.w/d  vreg2, vreg1, r2, %le_add_r(s)
//   addi.w/d vreg3, vreg2, %le_lo12_r(s)
//
// Hi20 is the candidate LU12I_W. The sequence is accepted only when:
//   1) Hi20's result has a single use and that user is the PseudoAddTPRel
//      matching the subtarget width (_D on 64-bit, _W otherwise), whose
//      operand 2 is r2 (the tp register).
//   2) The PseudoAddTPRel result has a single use and that user is the
//      ADDI matching the subtarget width (ADDI_D / ADDI_W).
//   3) Each symbol operand carries the expected direct target flag
//      (MO_LE_HI_R, MO_LE_ADD_R, MO_LE_LO_R). Hi20's operand must be a
//      global or constant-pool index; the other two may also be an MCSymbol.
//   4) Every symbol operand has a zero offset.
//
// Add and Lo12 are out-parameters. Each is written as soon as the
// corresponding user instruction is located, so they may have been assigned
// even when the function ends up returning false.
bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
                                                 MachineInstr *&Add,
                                                 MachineInstr *&Lo12) {
  if (Hi20.getOpcode() != LoongArch::LU12I_W)
    return false;

  auto IsGlobalOrConstPool = [](const MachineOperand &MO) {
    return MO.isGlobal() || MO.isCPI();
  };
  auto IsSymbolLike = [&IsGlobalOrConstPool](const MachineOperand &MO) {
    return IsGlobalOrConstPool(MO) || MO.isMCSymbol();
  };

  // Head of the sequence: lu12i.w with %le_hi20_r(s) and no offset yet.
  const MachineOperand &HiSym = Hi20.getOperand(1);
  if (LoongArchII::getDirectFlags(HiSym) != LoongArchII::MO_LE_HI_R)
    return false;
  if (!IsGlobalOrConstPool(HiSym) || HiSym.getOffset() != 0)
    return false;

  const Register HiResult = Hi20.getOperand(0).getReg();
  if (!MRI->hasOneUse(HiResult))
    return false;

  // Middle of the sequence: the thread-pointer relative add.
  Add = &*MRI->use_instr_begin(HiResult);
  const unsigned WantedAddOpc = ST->is64Bit() ? LoongArch::PseudoAddTPRel_D
                                              : LoongArch::PseudoAddTPRel_W;
  if (Add->getOpcode() != WantedAddOpc)
    return false;

  if (Add->getOperand(2).getReg() != LoongArch::R2)
    return false;

  const MachineOperand &AddSym = Add->getOperand(3);
  if (LoongArchII::getDirectFlags(AddSym) != LoongArchII::MO_LE_ADD_R)
    return false;
  if (!IsSymbolLike(AddSym) || AddSym.getOffset() != 0)
    return false;

  const Register AddResult = Add->getOperand(0).getReg();
  if (!MRI->hasOneUse(AddResult))
    return false;

  // Tail of the sequence: addi.w/d with %le_lo12_r(s).
  Lo12 = &*MRI->use_instr_begin(AddResult);
  const unsigned WantedAddiOpc =
      ST->is64Bit() ? LoongArch::ADDI_D : LoongArch::ADDI_W;
  if (Lo12->getOpcode() != WantedAddiOpc)
    return false;

  const MachineOperand &LoSym = Lo12->getOperand(2);
  if (LoongArchII::getDirectFlags(LoSym) != LoongArchII::MO_LE_LO_R)
    return false;
  if (!IsSymbolLike(LoSym) || LoSym.getOffset() != 0)
    return false;

  LLVM_DEBUG({
    if (HiSym.isGlobal())
      dbgs() << " Found lowered global address: " << *HiSym.getGlobal()
             << "\n";
    else if (HiSym.isCPI())
      dbgs() << " Found lowered constant pool: " << HiSym.getIndex() << "\n";
  });

  return true;
}

// Update the offset in Hi20, (Add), Lo12, (Lo20 and Hi12) instructions.
// Delete the tail instruction and update all the uses to use the
// output from Last.
void LoongArchMergeBaseOffsetOpt::foldOffset(
Expand All @@ -190,31 +265,49 @@ void LoongArchMergeBaseOffsetOpt::foldOffset(
Lo20->getOperand(2).setOffset(Offset);
Hi12->getOperand(2).setOffset(Offset);
}

// For tls-le, offset of the second PseudoAddTPRel instr should also be
// updated.
MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
if (Hi20.getOpcode() == LoongArch::LU12I_W)
Add->getOperand(3).setOffset(Offset);

// Delete the tail instruction.
MachineInstr *Def = Last ? Last : &Lo12;
MRI->constrainRegClass(Def->getOperand(0).getReg(),
MRI->getRegClass(Tail.getOperand(0).getReg()));
MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg());
Tail.eraseFromParent();

LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n"
<< " " << Hi20 << " " << Lo12;);
<< " " << Hi20;);
if (Hi20.getOpcode() == LoongArch::LU12I_W) {
LLVM_DEBUG(dbgs() << " " << *Add;);
}
LLVM_DEBUG(dbgs() << " " << Lo12;);
if (Lo20 && Hi12) {
LLVM_DEBUG(dbgs() << " " << *Lo20 << " " << *Hi12;);
}
}

// Detect patterns for large offsets that are passed into an ADD instruction.
// If the pattern is found, updates the offset in Hi20, Lo12, Lo20 and Hi12
// instructions and deletes TailAdd and the instructions that produced the
// offset.
// If the pattern is found, updates the offset in Hi20, (Add), Lo12,
// (Lo20 and Hi12) instructions and deletes TailAdd and the instructions that
// produced the offset.
//
// (The instructions marked with "!" are not necessarily present)
//
// Base address lowering is of the form:
// Hi20: pcalau12i vreg1, %pc_hi20(s)
// +- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
// | Lo20: lu32i.d vreg2, %pc64_lo20(s) !
// +- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
// 1) pcala:
// Hi20: pcalau12i vreg1, %pc_hi20(s)
// +--- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
// | Lo20: lu32i.d vreg2, %pc64_lo20(s) !
// +--- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
// |
// | 2) tls-le:
// | Hi20: lu12i.w vreg1, %le_hi20_r(s)
// | Add: add.w/d vreg1, vreg1, r2, %le_add_r(s)
// +--- Lo12: addi.w/d vreg2, vreg1, %le_lo12_r(s)
// |
// | The large offset can be one of the forms:
// |
Expand Down Expand Up @@ -334,7 +427,8 @@ bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,

// Look for arithmetic instructions we can get an offset from.
// We might be able to remove the arithmetic instructions by folding the
// offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I).
// offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or
// LU12I_W+PseudoAddTPRel+ADDI.
if (!MRI->hasOneUse(DestReg))
return false;

Expand Down Expand Up @@ -454,6 +548,7 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
// If all the uses are memory ops with the same offset, we can transform:
//
// 1. (small/medium):
// 1.1. pcala
// pcalau12i vreg1, %pc_hi20(s)
// addi.d vreg2, vreg1, %pc_lo12(s)
// ld.w vreg3, 8(vreg2)
Expand All @@ -463,6 +558,18 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
// pcalau12i vreg1, %pc_hi20(s+8)
// ld.w vreg3, vreg1, %pc_lo12(s+8)(vreg1)
//
// 1.2. tls-le
// lu12i.w vreg1, %le_hi20_r(s)
// add.w/d vreg2, vreg1, r2, %le_add_r(s)
// addi.w/d vreg3, vreg2, %le_lo12_r(s)
// ld.w vreg4, 8(vreg3)
//
// =>
//
// lu12i.w vreg1, %le_hi20_r(s+8)
// add.w/d vreg2, vreg1, r2, %le_add_r(s+8)
// ld.w vreg4, vreg2, %le_lo12_r(s+8)(vreg2)
//
// 2. (large):
// pcalau12i vreg1, %pc_hi20(s)
// addi.d vreg2, $zero, %pc_lo12(s)
Expand Down Expand Up @@ -598,7 +705,8 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
return false;

// If optimized by this pass successfully, MO_RELAX bitmask target-flag should
// be removed from the code sequence.
// be removed from the pcala code sequence. Code sequence of tls-le can still
// be relaxed after being optimized.
//
// For example:
// pcalau12i $a0, %pc_hi20(symbol)
Expand All @@ -614,15 +722,20 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
// optimized, it cannot be relaxed any more. So MO_RELAX flag should not be
// carried by them.
Hi20.getOperand(1).setOffset(NewOffset);
Hi20.getOperand(1).setTargetFlags(
LoongArchII::getDirectFlags(Hi20.getOperand(1)));
MachineOperand &ImmOp = Lo12.getOperand(2);
ImmOp.setOffset(NewOffset);
ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp));
if (Lo20 && Hi12) {
Lo20->getOperand(2).setOffset(NewOffset);
Hi12->getOperand(2).setOffset(NewOffset);
}
if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
Hi20.getOperand(1).setTargetFlags(
LoongArchII::getDirectFlags(Hi20.getOperand(1)));
ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp));
} else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
Add->getOperand(3).setOffset(NewOffset);
}

// Update the immediate in the load/store instructions to add the offset.
const LoongArchInstrInfo &TII = *ST->getInstrInfo();
Expand Down Expand Up @@ -673,7 +786,14 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
return true;
}

MRI->replaceRegWith(Lo12.getOperand(0).getReg(), Hi20.getOperand(0).getReg());
if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
Hi20.getOperand(0).getReg());
} else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
Add->getOperand(0).getReg());
}
Lo12.eraseFromParent();
return true;
}
Expand All @@ -693,8 +813,21 @@ bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
MachineInstr *Lo20 = nullptr;
MachineInstr *Hi12 = nullptr;
MachineInstr *Last = nullptr;
if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last))
if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
// Detect foldable pcala code sequence in small/medium/large code model.
if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last))
continue;
} else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
MachineInstr *Add = nullptr;
// Detect foldable tls-le code sequence in small/medium code model.
if (!detectFoldable(Hi20, Add, Lo12))
continue;
} else {
continue;
}
// For tls-le, we do not pass the second PseudoAddTPRel instr in order to
// reuse the existing hooks and the last three parameters should always be
// nullptr.
MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last);
MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last);
}
Expand Down
6 changes: 2 additions & 4 deletions llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
Original file line number Diff line number Diff line change
Expand Up @@ -317,11 +317,10 @@ define void @test_la_tls_le(i32 signext %n) {
; LA32-NEXT: move $a1, $zero
; LA32-NEXT: lu12i.w $a2, %le_hi20_r(le)
; LA32-NEXT: add.w $a2, $a2, $tp, %le_add_r(le)
; LA32-NEXT: addi.w $a2, $a2, %le_lo12_r(le)
; LA32-NEXT: .p2align 4, , 16
; LA32-NEXT: .LBB4_1: # %loop
; LA32-NEXT: # =>This Inner Loop Header: Depth=1
; LA32-NEXT: ld.w $zero, $a2, 0
; LA32-NEXT: ld.w $zero, $a2, %le_lo12_r(le)
; LA32-NEXT: addi.w $a1, $a1, 1
; LA32-NEXT: blt $a1, $a0, .LBB4_1
; LA32-NEXT: # %bb.2: # %ret
Expand All @@ -332,11 +331,10 @@ define void @test_la_tls_le(i32 signext %n) {
; LA64-NEXT: move $a1, $zero
; LA64-NEXT: lu12i.w $a2, %le_hi20_r(le)
; LA64-NEXT: add.d $a2, $a2, $tp, %le_add_r(le)
; LA64-NEXT: addi.d $a2, $a2, %le_lo12_r(le)
; LA64-NEXT: .p2align 4, , 16
; LA64-NEXT: .LBB4_1: # %loop
; LA64-NEXT: # =>This Inner Loop Header: Depth=1
; LA64-NEXT: ld.w $zero, $a2, 0
; LA64-NEXT: ld.w $zero, $a2, %le_lo12_r(le)
; LA64-NEXT: addi.w $a1, $a1, 1
; LA64-NEXT: blt $a1, $a0, .LBB4_1
; LA64-NEXT: # %bb.2: # %ret
Expand Down
Loading