Skip to content

Commit 0b5c318

Browse files
authored
[LoongArch] Merge base and offset for tls-le code sequence (llvm#122999)
Adapt the merge base offset pass to optimize the tls-le code sequence.
1 parent 738cf5a commit 0b5c318

File tree

3 files changed

+266
-223
lines changed

3 files changed

+266
-223
lines changed

llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp

Lines changed: 149 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass {
3737
bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12,
3838
MachineInstr *&Lo20, MachineInstr *&Hi12,
3939
MachineInstr *&Last);
40+
bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Add,
41+
MachineInstr *&Lo12);
4042

4143
bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12,
4244
MachineInstr *&Lo20, MachineInstr *&Hi12,
@@ -176,7 +178,80 @@ bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
176178
return true;
177179
}
178180

179-
// Update the offset in Hi20, Lo12, Lo20 and Hi12 instructions.
181+
// Detect the pattern:
182+
//
183+
// (small/medium):
184+
// lu12i.w vreg1, %le_hi20_r(s)
185+
// add.w/d vreg2, vreg1, r2, %le_add_r(s)
186+
// addi.w/d vreg3, vreg2, %le_lo12_r(s)
187+
188+
// The pattern is only accepted if:
189+
// 1) The first instruction has only one use, which is the PseudoAddTPRel.
190+
// The second instruction has only one use, which is the ADDI. The
191+
// second instruction's last register operand is the tp register.
192+
// 2) The address operands have the appropriate type, reflecting the
193+
// lowering of a thread_local global address using the pattern.
194+
// 3) The offset value in the ThreadLocal Global Address is 0.
195+
// Matches the tls-le sequence LU12I_W -> PseudoAddTPRel -> ADDI that starts
// at Hi20. Returns true only if every check below passes; in that case Add
// and Lo12 point at the matched PseudoAddTPRel and ADDI instructions.
// Note that Add and Lo12 are assigned before their own checks run, so on a
// false return they may already point at the rejected candidate.
bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
196+
MachineInstr *&Add,
197+
MachineInstr *&Lo12) {
198+
// The sequence must begin with LU12I_W.
if (Hi20.getOpcode() != LoongArch::LU12I_W)
199+
return false;
200+
201+
// Operand kinds accepted for the symbol on Hi20; the Add and Lo12 checks
// below additionally accept an MCSymbol.
auto isGlobalOrCPI = [](const MachineOperand &Op) {
202+
return Op.isGlobal() || Op.isCPI();
203+
};
204+
205+
// Hi20's operand 1 must carry the MO_LE_HI_R flag, be a global or
// constant-pool operand, and have a zero offset.
const MachineOperand &Hi20Op1 = Hi20.getOperand(1);
206+
if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_LE_HI_R ||
207+
!isGlobalOrCPI(Hi20Op1) || Hi20Op1.getOffset() != 0)
208+
return false;
209+
210+
// The register defined by Hi20 must have exactly one use.
Register HiDestReg = Hi20.getOperand(0).getReg();
211+
if (!MRI->hasOneUse(HiDestReg))
212+
return false;
213+
214+
// That single user must be PseudoAddTPRel_D on 64-bit subtargets and
// PseudoAddTPRel_W otherwise.
Add = &*MRI->use_instr_begin(HiDestReg);
215+
if ((ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_D) ||
216+
(!ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_W))
217+
return false;
218+
219+
// Operand 2 of the add must be R2 (the tp register).
if (Add->getOperand(2).getReg() != LoongArch::R2)
220+
return false;
221+
222+
// Operand 3 of the add is the symbol: it must carry MO_LE_ADD_R, be a
// global, constant-pool or MCSymbol operand, and have a zero offset.
const MachineOperand &AddOp3 = Add->getOperand(3);
223+
if (LoongArchII::getDirectFlags(AddOp3) != LoongArchII::MO_LE_ADD_R ||
224+
!(isGlobalOrCPI(AddOp3) || AddOp3.isMCSymbol()) ||
225+
AddOp3.getOffset() != 0)
226+
return false;
227+
228+
// The register defined by the add must also have exactly one use.
Register AddDestReg = Add->getOperand(0).getReg();
229+
if (!MRI->hasOneUse(AddDestReg))
230+
return false;
231+
232+
// That single user must be ADDI_D on 64-bit subtargets and ADDI_W otherwise.
Lo12 = &*MRI->use_instr_begin(AddDestReg);
233+
if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||
234+
(!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))
235+
return false;
236+
237+
// Operand 2 of the ADDI is the symbol: it must carry MO_LE_LO_R, be a
// global, constant-pool or MCSymbol operand, and have a zero offset.
const MachineOperand &Lo12Op2 = Lo12->getOperand(2);
238+
if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_LE_LO_R ||
239+
!(isGlobalOrCPI(Lo12Op2) || Lo12Op2.isMCSymbol()) ||
240+
Lo12Op2.getOffset() != 0)
241+
return false;
242+
243+
// Debug-only trace of what was matched; it does not affect the result.
if (Hi20Op1.isGlobal()) {
244+
LLVM_DEBUG(dbgs() << " Found lowered global address: "
245+
<< *Hi20Op1.getGlobal() << "\n");
246+
} else if (Hi20Op1.isCPI()) {
247+
LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex()
248+
<< "\n");
249+
}
250+
251+
return true;
252+
}
253+
254+
// Update the offset in Hi20, (Add), Lo12, (Lo20 and Hi12) instructions.
180255
// Delete the tail instruction and update all the uses to use the
181256
// output from Last.
182257
void LoongArchMergeBaseOffsetOpt::foldOffset(
@@ -190,31 +265,49 @@ void LoongArchMergeBaseOffsetOpt::foldOffset(
190265
Lo20->getOperand(2).setOffset(Offset);
191266
Hi12->getOperand(2).setOffset(Offset);
192267
}
268+
269+
// For tls-le, offset of the second PseudoAddTPRel instr should also be
270+
// updated.
271+
MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
272+
if (Hi20.getOpcode() == LoongArch::LU12I_W)
273+
Add->getOperand(3).setOffset(Offset);
274+
193275
// Delete the tail instruction.
194276
MachineInstr *Def = Last ? Last : &Lo12;
195277
MRI->constrainRegClass(Def->getOperand(0).getReg(),
196278
MRI->getRegClass(Tail.getOperand(0).getReg()));
197279
MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg());
198280
Tail.eraseFromParent();
281+
199282
LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n"
200-
<< " " << Hi20 << " " << Lo12;);
283+
<< " " << Hi20;);
284+
if (Hi20.getOpcode() == LoongArch::LU12I_W) {
285+
LLVM_DEBUG(dbgs() << " " << *Add;);
286+
}
287+
LLVM_DEBUG(dbgs() << " " << Lo12;);
201288
if (Lo20 && Hi12) {
202289
LLVM_DEBUG(dbgs() << " " << *Lo20 << " " << *Hi12;);
203290
}
204291
}
205292

206293
// Detect patterns for large offsets that are passed into an ADD instruction.
207-
// If the pattern is found, updates the offset in Hi20, Lo12, Lo20 and Hi12
208-
// instructions and deletes TailAdd and the instructions that produced the
209-
// offset.
294+
// If the pattern is found, updates the offset in Hi20, (Add), Lo12,
295+
// (Lo20 and Hi12) instructions and deletes TailAdd and the instructions that
296+
// produced the offset.
210297
//
211298
// (The instructions marked with "!" are not necessarily present)
212299
//
213300
// Base address lowering is of the form:
214-
// Hi20: pcalau12i vreg1, %pc_hi20(s)
215-
// +- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
216-
// | Lo20: lu32i.d vreg2, %pc64_lo20(s) !
217-
// +- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
301+
// 1) pcala:
302+
// Hi20: pcalau12i vreg1, %pc_hi20(s)
303+
// +--- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
304+
// | Lo20: lu32i.d vreg2, %pc64_lo20(s) !
305+
// +--- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
306+
// |
307+
// | 2) tls-le:
308+
// | Hi20: lu12i.w vreg1, %le_hi20_r(s)
309+
// | Add: add.w/d vreg1, vreg1, r2, %le_add_r(s)
310+
// +--- Lo12: addi.w/d vreg2, vreg1, %le_lo12_r(s)
218311
// |
219312
// | The large offset can be one of the forms:
220313
// |
@@ -334,7 +427,8 @@ bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,
334427

335428
// Look for arithmetic instructions we can get an offset from.
336429
// We might be able to remove the arithmetic instructions by folding the
337-
// offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I).
430+
// offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or
431+
// LU12I_W+PseudoAddTPRel+ADDI.
338432
if (!MRI->hasOneUse(DestReg))
339433
return false;
340434

@@ -454,6 +548,7 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
454548
// If all the uses are memory ops with the same offset, we can transform:
455549
//
456550
// 1. (small/medium):
551+
// 1.1. pcala
457552
// pcalau12i vreg1, %pc_hi20(s)
458553
// addi.d vreg2, vreg1, %pc_lo12(s)
459554
// ld.w vreg3, 8(vreg2)
@@ -463,6 +558,18 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
463558
// pcalau12i vreg1, %pc_hi20(s+8)
464559
// ld.w vreg3, vreg1, %pc_lo12(s+8)(vreg1)
465560
//
561+
// 1.2. tls-le
562+
// lu12i.w vreg1, %le_hi20_r(s)
563+
// add.w/d vreg2, vreg1, r2, %le_add_r(s)
564+
// addi.w/d vreg3, vreg2, %le_lo12_r(s)
565+
// ld.w vreg4, 8(vreg3)
566+
//
567+
// =>
568+
//
569+
// lu12i.w vreg1, %le_hi20_r(s+8)
570+
// add.w/d vreg2, vreg1, r2, %le_add_r(s+8)
571+
// ld.w vreg4, vreg2, %le_lo12_r(s+8)(vreg2)
572+
//
466573
// 2. (large):
467574
// pcalau12i vreg1, %pc_hi20(s)
468575
// addi.d vreg2, $zero, %pc_lo12(s)
@@ -598,7 +705,8 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
598705
return false;
599706

600707
// If optimized by this pass successfully, MO_RELAX bitmask target-flag should
601-
// be removed from the code sequence.
708+
// be removed from the pcala code sequence. Code sequence of tls-le can still
709+
// be relaxed after being optimized.
602710
//
603711
// For example:
604712
// pcalau12i $a0, %pc_hi20(symbol)
@@ -614,15 +722,20 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
614722
// optimized, it cannot be relaxed any more. So MO_RELAX flag should not be
615723
// carried by them.
616724
Hi20.getOperand(1).setOffset(NewOffset);
617-
Hi20.getOperand(1).setTargetFlags(
618-
LoongArchII::getDirectFlags(Hi20.getOperand(1)));
619725
MachineOperand &ImmOp = Lo12.getOperand(2);
620726
ImmOp.setOffset(NewOffset);
621-
ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp));
622727
if (Lo20 && Hi12) {
623728
Lo20->getOperand(2).setOffset(NewOffset);
624729
Hi12->getOperand(2).setOffset(NewOffset);
625730
}
731+
if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
732+
Hi20.getOperand(1).setTargetFlags(
733+
LoongArchII::getDirectFlags(Hi20.getOperand(1)));
734+
ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp));
735+
} else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
736+
MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
737+
Add->getOperand(3).setOffset(NewOffset);
738+
}
626739

627740
// Update the immediate in the load/store instructions to add the offset.
628741
const LoongArchInstrInfo &TII = *ST->getInstrInfo();
@@ -673,7 +786,14 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
673786
return true;
674787
}
675788

676-
MRI->replaceRegWith(Lo12.getOperand(0).getReg(), Hi20.getOperand(0).getReg());
789+
if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
790+
MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
791+
Hi20.getOperand(0).getReg());
792+
} else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
793+
MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
794+
MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
795+
Add->getOperand(0).getReg());
796+
}
677797
Lo12.eraseFromParent();
678798
return true;
679799
}
@@ -693,8 +813,21 @@ bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
693813
MachineInstr *Lo20 = nullptr;
694814
MachineInstr *Hi12 = nullptr;
695815
MachineInstr *Last = nullptr;
696-
if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last))
816+
if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
817+
// Detect foldable pcala code sequence in small/medium/large code model.
818+
if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last))
819+
continue;
820+
} else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
821+
MachineInstr *Add = nullptr;
822+
// Detect foldable tls-le code sequence in small/medium code model.
823+
if (!detectFoldable(Hi20, Add, Lo12))
824+
continue;
825+
} else {
697826
continue;
827+
}
828+
// For tls-le, we do not pass the second PseudoAddTPRel instr in order to
829+
// reuse the existing hooks and the last three parameters should always be
830+
// nullptr.
698831
MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last);
699832
MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last);
700833
}

llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -317,11 +317,10 @@ define void @test_la_tls_le(i32 signext %n) {
317317
; LA32-NEXT: move $a1, $zero
318318
; LA32-NEXT: lu12i.w $a2, %le_hi20_r(le)
319319
; LA32-NEXT: add.w $a2, $a2, $tp, %le_add_r(le)
320-
; LA32-NEXT: addi.w $a2, $a2, %le_lo12_r(le)
321320
; LA32-NEXT: .p2align 4, , 16
322321
; LA32-NEXT: .LBB4_1: # %loop
323322
; LA32-NEXT: # =>This Inner Loop Header: Depth=1
324-
; LA32-NEXT: ld.w $zero, $a2, 0
323+
; LA32-NEXT: ld.w $zero, $a2, %le_lo12_r(le)
325324
; LA32-NEXT: addi.w $a1, $a1, 1
326325
; LA32-NEXT: blt $a1, $a0, .LBB4_1
327326
; LA32-NEXT: # %bb.2: # %ret
@@ -332,11 +331,10 @@ define void @test_la_tls_le(i32 signext %n) {
332331
; LA64-NEXT: move $a1, $zero
333332
; LA64-NEXT: lu12i.w $a2, %le_hi20_r(le)
334333
; LA64-NEXT: add.d $a2, $a2, $tp, %le_add_r(le)
335-
; LA64-NEXT: addi.d $a2, $a2, %le_lo12_r(le)
336334
; LA64-NEXT: .p2align 4, , 16
337335
; LA64-NEXT: .LBB4_1: # %loop
338336
; LA64-NEXT: # =>This Inner Loop Header: Depth=1
339-
; LA64-NEXT: ld.w $zero, $a2, 0
337+
; LA64-NEXT: ld.w $zero, $a2, %le_lo12_r(le)
340338
; LA64-NEXT: addi.w $a1, $a1, 1
341339
; LA64-NEXT: blt $a1, $a0, .LBB4_1
342340
; LA64-NEXT: # %bb.2: # %ret

0 commit comments

Comments
 (0)