[LoongArch] Merge base and offset for tls-le code sequence #122999
Conversation
This commit adds relax relocations for the tls_le code sequence. Both handwritten assembly and code generated by Clang are affected. The tls_le code sequence can be relaxed normally even after instruction scheduling, so relax relocations can be attached during code emission according to the operands' relocation modifiers. The code sequences of other relaxable macros cannot be scheduled when relaxation is enabled; attaching relax relocations to them will be implemented in a later commit.
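For illustration, here is the tls_le sequence with the relocations each instruction carries once relax relocations are attached (a sketch following the LoongArch psABI relocation names; `sym` stands for a hypothetical thread-local symbol):

```asm
lu12i.w $a0, %le_hi20_r(sym)           # R_LARCH_TLS_LE_HI20_R + R_LARCH_RELAX
add.d   $a0, $a0, $tp, %le_add_r(sym)  # R_LARCH_TLS_LE_ADD_R  + R_LARCH_RELAX
addi.d  $a0, $a0, %le_lo12_r(sym)      # R_LARCH_TLS_LE_LO12_R + R_LARCH_RELAX
```

With these pairs in place, the linker may shrink the sequence when the TP offset fits in a smaller encoding.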
… relocs: If linker relaxation is enabled, relaxable code sequences expanded from pseudos should not be separated by instruction scheduling. This commit tags them as scheduling boundaries so they are not scheduled apart (except for `tls_le` and `call36/tail36`: `tls_le` can be scheduled without affecting relaxation, and `call36/tail36` are expanded later, in the `LoongArchExpandPseudo` pass). A new bitmask target-flag is added to attach relax relocs to the relaxable code sequences (not needed for `tls_le` and `call36/tail36`, for the reasons above). Because of this, fetching the "direct" flags is necessary when using their target-flags. In addition, a code sequence optimized by the `MergeBaseOffset` pass may no longer be relaxable, so the relax "bitmask" flag should be removed from it.
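As a sketch of why adjacency matters for the other relaxable macros (again with `sym` a hypothetical symbol), consider a pcala pair:

```asm
pcalau12i $a0, %pc_hi20(sym)       # R_LARCH_PCALA_HI20 + R_LARCH_RELAX
addi.d    $a0, $a0, %pc_lo12(sym)  # R_LARCH_PCALA_LO12 + R_LARCH_RELAX
```

The linker can only rewrite such a pair (e.g. into a single `pcaddi`) while the two instructions stay back to back, hence the scheduling boundary.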
The new tests mirror those in `merge-base-offset.ll`, except for the blockaddress tests; a later commit will optimize that case.
@llvm/pr-subscribers-backend-loongarch

Author: ZhaoQi (zhaoqi5)

Changes: Adapt the merge base offset pass to optimize the tls-le code sequence.

Patch is 43.81 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/122999.diff

3 Files Affected:
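For example (lifted from the updated tests below), a tls-le access with offset 0 folds the low 12 bits of the address into the memory access itself:

```asm
lu12i.w $a0, %le_hi20_r(g_i8)
add.d   $a0, $a0, $tp, %le_add_r(g_i8)
addi.d  $a0, $a0, %le_lo12_r(g_i8)
ld.b    $a0, $a0, 0
```

becomes

```asm
lu12i.w $a0, %le_hi20_r(g_i8)
add.d   $a0, $a0, $tp, %le_add_r(g_i8)
ld.b    $a0, $a0, %le_lo12_r(g_i8)
```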
diff --git a/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp b/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp
index 7f98f7718a538d..19bb2c2f52b0cf 100644
--- a/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchMergeBaseOffset.cpp
@@ -37,6 +37,8 @@ class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass {
bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Lo12,
MachineInstr *&Lo20, MachineInstr *&Hi12,
MachineInstr *&Last);
+ bool detectFoldable(MachineInstr &Hi20, MachineInstr *&Add,
+ MachineInstr *&Lo12);
bool detectAndFoldOffset(MachineInstr &Hi20, MachineInstr &Lo12,
MachineInstr *&Lo20, MachineInstr *&Hi12,
@@ -176,7 +178,80 @@ bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
return true;
}
-// Update the offset in Hi20, Lo12, Lo20 and Hi12 instructions.
+// Detect the pattern:
+//
+// (small/medium):
+// lu12i.w vreg1, %le_hi20_r(s)
+// add.w/d vreg2, vreg1, r2, %le_add_r(s)
+// addi.w/d vreg3, vreg2, %le_lo12_r(s)
+
+// The pattern is only accepted if:
+// 1) The first instruction has only one use, which is the PseudoAddTPRel.
+// The second instruction has only one use, which is the ADDI. The
+// second instruction's last operand is the tp register.
+// 2) The address operands have the appropriate type, reflecting the
+// lowering of a thread_local global address using the pattern.
+// 3) The offset value in the ThreadLocal Global Address is 0.
+bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
+ MachineInstr *&Add,
+ MachineInstr *&Lo12) {
+ if (Hi20.getOpcode() != LoongArch::LU12I_W)
+ return false;
+
+ auto isGlobalOrCPI = [](const MachineOperand &Op) {
+ return Op.isGlobal() || Op.isCPI();
+ };
+
+ const MachineOperand &Hi20Op1 = Hi20.getOperand(1);
+ if (LoongArchII::getDirectFlags(Hi20Op1) != LoongArchII::MO_LE_HI_R ||
+ !isGlobalOrCPI(Hi20Op1) || Hi20Op1.getOffset() != 0)
+ return false;
+
+ Register HiDestReg = Hi20.getOperand(0).getReg();
+ if (!MRI->hasOneUse(HiDestReg))
+ return false;
+
+ Add = &*MRI->use_instr_begin(HiDestReg);
+ if ((ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_D) ||
+ (!ST->is64Bit() && Add->getOpcode() != LoongArch::PseudoAddTPRel_W))
+ return false;
+
+ if (Add->getOperand(2).getReg() != LoongArch::R2)
+ return false;
+
+ const MachineOperand &AddOp3 = Add->getOperand(3);
+ if (LoongArchII::getDirectFlags(AddOp3) != LoongArchII::MO_LE_ADD_R ||
+ !(isGlobalOrCPI(AddOp3) || AddOp3.isMCSymbol()) ||
+ AddOp3.getOffset() != 0)
+ return false;
+
+ Register AddDestReg = Add->getOperand(0).getReg();
+ if (!MRI->hasOneUse(AddDestReg))
+ return false;
+
+ Lo12 = &*MRI->use_instr_begin(AddDestReg);
+ if ((ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_D) ||
+ (!ST->is64Bit() && Lo12->getOpcode() != LoongArch::ADDI_W))
+ return false;
+
+ const MachineOperand &Lo12Op2 = Lo12->getOperand(2);
+ if (LoongArchII::getDirectFlags(Lo12Op2) != LoongArchII::MO_LE_LO_R ||
+ !(isGlobalOrCPI(Lo12Op2) || Lo12Op2.isMCSymbol()) ||
+ Lo12Op2.getOffset() != 0)
+ return false;
+
+ if (Hi20Op1.isGlobal()) {
+ LLVM_DEBUG(dbgs() << " Found lowered global address: "
+ << *Hi20Op1.getGlobal() << "\n");
+ } else if (Hi20Op1.isCPI()) {
+ LLVM_DEBUG(dbgs() << " Found lowered constant pool: " << Hi20Op1.getIndex()
+ << "\n");
+ }
+
+ return true;
+}
+
+// Update the offset in Hi20, (Add), Lo12, (Lo20 and Hi12) instructions.
// Delete the tail instruction and update all the uses to use the
// output from Last.
void LoongArchMergeBaseOffsetOpt::foldOffset(
@@ -190,31 +265,49 @@ void LoongArchMergeBaseOffsetOpt::foldOffset(
Lo20->getOperand(2).setOffset(Offset);
Hi12->getOperand(2).setOffset(Offset);
}
+
+ // For tls-le, offset of the second PseudoAddTPRel instr should also be
+ // updated.
+ MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
+ if (Hi20.getOpcode() == LoongArch::LU12I_W)
+ Add->getOperand(3).setOffset(Offset);
+
// Delete the tail instruction.
MachineInstr *Def = Last ? Last : &Lo12;
MRI->constrainRegClass(Def->getOperand(0).getReg(),
MRI->getRegClass(Tail.getOperand(0).getReg()));
MRI->replaceRegWith(Tail.getOperand(0).getReg(), Def->getOperand(0).getReg());
Tail.eraseFromParent();
+
LLVM_DEBUG(dbgs() << " Merged offset " << Offset << " into base.\n"
- << " " << Hi20 << " " << Lo12;);
+ << " " << Hi20;);
+ if (Hi20.getOpcode() == LoongArch::LU12I_W) {
+ LLVM_DEBUG(dbgs() << " " << *Add;);
+ }
+ LLVM_DEBUG(dbgs() << " " << Lo12;);
if (Lo20 && Hi12) {
LLVM_DEBUG(dbgs() << " " << *Lo20 << " " << *Hi12;);
}
}
// Detect patterns for large offsets that are passed into an ADD instruction.
-// If the pattern is found, updates the offset in Hi20, Lo12, Lo20 and Hi12
-// instructions and deletes TailAdd and the instructions that produced the
-// offset.
+// If the pattern is found, updates the offset in Hi20, (Add), Lo12,
+// (Lo20 and Hi12) instructions and deletes TailAdd and the instructions that
+// produced the offset.
//
// (The instructions marked with "!" are not necessarily present)
//
// Base address lowering is of the form:
-// Hi20: pcalau12i vreg1, %pc_hi20(s)
-// +- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
-// | Lo20: lu32i.d vreg2, %pc64_lo20(s) !
-// +- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
+// 1) pcala:
+// Hi20: pcalau12i vreg1, %pc_hi20(s)
+// +--- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
+// | Lo20: lu32i.d vreg2, %pc64_lo20(s) !
+// +--- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
+// |
+// | 2) tls-le:
+// | Hi20: lu12i.w vreg1, %le_hi20_r(s)
+// | Add: add.w/d vreg1, vreg1, r2, %le_add_r(s)
+// +--- Lo12: addi.w/d vreg2, vreg1, %le_lo12_r(s)
// |
// | The large offset can be one of the forms:
// |
@@ -334,7 +427,8 @@ bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,
// Look for arithmetic instructions we can get an offset from.
// We might be able to remove the arithmetic instructions by folding the
- // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I).
+ // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or
+ // LU12I_W+PseudoAddTPRel+ADDI.
if (!MRI->hasOneUse(DestReg))
return false;
@@ -454,6 +548,7 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
// If all the uses are memory ops with the same offset, we can transform:
//
// 1. (small/medium):
+ // 1.1. pcala
// pcalau12i vreg1, %pc_hi20(s)
// addi.d vreg2, vreg1, %pc_lo12(s)
// ld.w vreg3, 8(vreg2)
@@ -463,6 +558,18 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
// pcalau12i vreg1, %pc_hi20(s+8)
// ld.w vreg3, vreg1, %pc_lo12(s+8)(vreg1)
//
+ // 1.2. tls-le
+ // lu12i.w vreg1, %le_hi20_r(s)
+ // add.w/d vreg2, vreg1, r2, %le_add_r(s)
+ // addi.w/d vreg3, vreg2, %le_lo12_r(s)
+ // ld.w vreg4, 8(vreg3)
+ //
+ // =>
+ //
+ // lu12i.w vreg1, %le_hi20_r(s+8)
+ // add.w/d vreg2, vreg1, r2, %le_add_r(s+8)
+ // ld.w vreg4, vreg2, %le_lo12_r(s+8)(vreg2)
+ //
// 2. (large):
// pcalau12i vreg1, %pc_hi20(s)
// addi.d vreg2, $zero, %pc_lo12(s)
@@ -598,7 +705,8 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
return false;
// If optimized by this pass successfully, MO_RELAX bitmask target-flag should
- // be removed from the code sequence.
+ // be removed from the pcala code sequence. Code sequence of tls-le can still
+ // be relaxed after being optimized.
//
// For example:
// pcalau12i $a0, %pc_hi20(symbol)
@@ -614,15 +722,20 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
// optimized, it cannot be relaxed any more. So MO_RELAX flag should not be
// carried by them.
Hi20.getOperand(1).setOffset(NewOffset);
- Hi20.getOperand(1).setTargetFlags(
- LoongArchII::getDirectFlags(Hi20.getOperand(1)));
MachineOperand &ImmOp = Lo12.getOperand(2);
ImmOp.setOffset(NewOffset);
- ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp));
if (Lo20 && Hi12) {
Lo20->getOperand(2).setOffset(NewOffset);
Hi12->getOperand(2).setOffset(NewOffset);
}
+ if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
+ Hi20.getOperand(1).setTargetFlags(
+ LoongArchII::getDirectFlags(Hi20.getOperand(1)));
+ ImmOp.setTargetFlags(LoongArchII::getDirectFlags(ImmOp));
+ } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
+ MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
+ Add->getOperand(3).setOffset(NewOffset);
+ }
// Update the immediate in the load/store instructions to add the offset.
const LoongArchInstrInfo &TII = *ST->getInstrInfo();
@@ -673,7 +786,14 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
return true;
}
- MRI->replaceRegWith(Lo12.getOperand(0).getReg(), Hi20.getOperand(0).getReg());
+ if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
+ MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
+ Hi20.getOperand(0).getReg());
+ } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
+ MachineInstr *Add = &*MRI->use_instr_begin(Hi20.getOperand(0).getReg());
+ MRI->replaceRegWith(Lo12.getOperand(0).getReg(),
+ Add->getOperand(0).getReg());
+ }
Lo12.eraseFromParent();
return true;
}
@@ -689,14 +809,27 @@ bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
for (MachineBasicBlock &MBB : Fn) {
LLVM_DEBUG(dbgs() << "MBB: " << MBB.getName() << "\n");
for (MachineInstr &Hi20 : MBB) {
- MachineInstr *Lo12 = nullptr;
- MachineInstr *Lo20 = nullptr;
- MachineInstr *Hi12 = nullptr;
- MachineInstr *Last = nullptr;
- if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last))
- continue;
- MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last);
- MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last);
+ if (Hi20.getOpcode() == LoongArch::PCALAU12I) {
+ MachineInstr *Lo12 = nullptr;
+ MachineInstr *Lo20 = nullptr;
+ MachineInstr *Hi12 = nullptr;
+ MachineInstr *Last = nullptr;
+ if (!detectFoldable(Hi20, Lo12, Lo20, Hi12, Last))
+ continue;
+ MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Lo20, Hi12, Last);
+ MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Lo20, Hi12, Last);
+ } else if (Hi20.getOpcode() == LoongArch::LU12I_W) {
+ MachineInstr *Add = nullptr;
+ MachineInstr *Lo12 = nullptr;
+ MachineInstr *Tmp = nullptr;
+ if (!detectFoldable(Hi20, Add, Lo12))
+ continue;
+ // In order to reuse the existing hooks, we do not pass the second
+ // PseudoAddTPRel instr and the last three parameters should always be
+ // nullptr.
+ MadeChange |= detectAndFoldOffset(Hi20, *Lo12, Tmp, Tmp, Tmp);
+ MadeChange |= foldIntoMemoryOps(Hi20, *Lo12, Tmp, Tmp, Tmp);
+ }
}
}
diff --git a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
index e0a93e3051bf88..92d079ab3a8d87 100644
--- a/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
+++ b/llvm/test/CodeGen/LoongArch/machinelicm-address-pseudos.ll
@@ -317,11 +317,10 @@ define void @test_la_tls_le(i32 signext %n) {
; LA32-NEXT: move $a1, $zero
; LA32-NEXT: lu12i.w $a2, %le_hi20_r(le)
; LA32-NEXT: add.w $a2, $a2, $tp, %le_add_r(le)
-; LA32-NEXT: addi.w $a2, $a2, %le_lo12_r(le)
; LA32-NEXT: .p2align 4, , 16
; LA32-NEXT: .LBB4_1: # %loop
; LA32-NEXT: # =>This Inner Loop Header: Depth=1
-; LA32-NEXT: ld.w $zero, $a2, 0
+; LA32-NEXT: ld.w $zero, $a2, %le_lo12_r(le)
; LA32-NEXT: addi.w $a1, $a1, 1
; LA32-NEXT: blt $a1, $a0, .LBB4_1
; LA32-NEXT: # %bb.2: # %ret
@@ -332,11 +331,10 @@ define void @test_la_tls_le(i32 signext %n) {
; LA64-NEXT: move $a1, $zero
; LA64-NEXT: lu12i.w $a2, %le_hi20_r(le)
; LA64-NEXT: add.d $a2, $a2, $tp, %le_add_r(le)
-; LA64-NEXT: addi.d $a2, $a2, %le_lo12_r(le)
; LA64-NEXT: .p2align 4, , 16
; LA64-NEXT: .LBB4_1: # %loop
; LA64-NEXT: # =>This Inner Loop Header: Depth=1
-; LA64-NEXT: ld.w $zero, $a2, 0
+; LA64-NEXT: ld.w $zero, $a2, %le_lo12_r(le)
; LA64-NEXT: addi.w $a1, $a1, 1
; LA64-NEXT: blt $a1, $a0, .LBB4_1
; LA64-NEXT: # %bb.2: # %ret
diff --git a/llvm/test/CodeGen/LoongArch/merge-base-offset-tlsle.ll b/llvm/test/CodeGen/LoongArch/merge-base-offset-tlsle.ll
index 7e995d224ce1d2..9ed9a865ce55d4 100644
--- a/llvm/test/CodeGen/LoongArch/merge-base-offset-tlsle.ll
+++ b/llvm/test/CodeGen/LoongArch/merge-base-offset-tlsle.ll
@@ -11,16 +11,14 @@ define dso_local signext i8 @tlsle_load_s8() nounwind {
; LA32: # %bb.0: # %entry
; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8)
; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8)
-; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8)
-; LA32-NEXT: ld.b $a0, $a0, 0
+; LA32-NEXT: ld.b $a0, $a0, %le_lo12_r(g_i8)
; LA32-NEXT: ret
;
; LA64-LABEL: tlsle_load_s8:
; LA64: # %bb.0: # %entry
; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8)
; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8)
-; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8)
-; LA64-NEXT: ld.b $a0, $a0, 0
+; LA64-NEXT: ld.b $a0, $a0, %le_lo12_r(g_i8)
; LA64-NEXT: ret
entry:
%0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8)
@@ -33,16 +31,14 @@ define dso_local zeroext i8 @tlsle_load_u8() nounwind {
; LA32: # %bb.0: # %entry
; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8)
; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8)
-; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8)
-; LA32-NEXT: ld.bu $a0, $a0, 0
+; LA32-NEXT: ld.bu $a0, $a0, %le_lo12_r(g_i8)
; LA32-NEXT: ret
;
; LA64-LABEL: tlsle_load_u8:
; LA64: # %bb.0: # %entry
; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8)
; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8)
-; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8)
-; LA64-NEXT: ld.bu $a0, $a0, 0
+; LA64-NEXT: ld.bu $a0, $a0, %le_lo12_r(g_i8)
; LA64-NEXT: ret
entry:
%0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8)
@@ -55,18 +51,16 @@ define dso_local void @tlsle_store_i8() nounwind {
; LA32: # %bb.0: # %entry
; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i8)
; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i8)
-; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i8)
; LA32-NEXT: ori $a1, $zero, 1
-; LA32-NEXT: st.b $a1, $a0, 0
+; LA32-NEXT: st.b $a1, $a0, %le_lo12_r(g_i8)
; LA32-NEXT: ret
;
; LA64-LABEL: tlsle_store_i8:
; LA64: # %bb.0: # %entry
; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i8)
; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i8)
-; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i8)
; LA64-NEXT: ori $a1, $zero, 1
-; LA64-NEXT: st.b $a1, $a0, 0
+; LA64-NEXT: st.b $a1, $a0, %le_lo12_r(g_i8)
; LA64-NEXT: ret
entry:
%0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i8)
@@ -81,16 +75,14 @@ define dso_local signext i16 @tlsle_load_s16() nounwind {
; LA32: # %bb.0: # %entry
; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i16)
; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i16)
-; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i16)
-; LA32-NEXT: ld.h $a0, $a0, 0
+; LA32-NEXT: ld.h $a0, $a0, %le_lo12_r(g_i16)
; LA32-NEXT: ret
;
; LA64-LABEL: tlsle_load_s16:
; LA64: # %bb.0: # %entry
; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i16)
; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i16)
-; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i16)
-; LA64-NEXT: ld.h $a0, $a0, 0
+; LA64-NEXT: ld.h $a0, $a0, %le_lo12_r(g_i16)
; LA64-NEXT: ret
entry:
%0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i16)
@@ -103,16 +95,14 @@ define dso_local zeroext i16 @tlsle_load_u16() nounwind {
; LA32: # %bb.0: # %entry
; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i16)
; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i16)
-; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i16)
-; LA32-NEXT: ld.hu $a0, $a0, 0
+; LA32-NEXT: ld.hu $a0, $a0, %le_lo12_r(g_i16)
; LA32-NEXT: ret
;
; LA64-LABEL: tlsle_load_u16:
; LA64: # %bb.0: # %entry
; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i16)
; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i16)
-; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i16)
-; LA64-NEXT: ld.hu $a0, $a0, 0
+; LA64-NEXT: ld.hu $a0, $a0, %le_lo12_r(g_i16)
; LA64-NEXT: ret
entry:
%0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i16)
@@ -125,18 +115,16 @@ define dso_local void @tlsle_store_i16() nounwind {
; LA32: # %bb.0: # %entry
; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i16)
; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i16)
-; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i16)
; LA32-NEXT: ori $a1, $zero, 1
-; LA32-NEXT: st.h $a1, $a0, 0
+; LA32-NEXT: st.h $a1, $a0, %le_lo12_r(g_i16)
; LA32-NEXT: ret
;
; LA64-LABEL: tlsle_store_i16:
; LA64: # %bb.0: # %entry
; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i16)
; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i16)
-; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i16)
; LA64-NEXT: ori $a1, $zero, 1
-; LA64-NEXT: st.h $a1, $a0, 0
+; LA64-NEXT: st.h $a1, $a0, %le_lo12_r(g_i16)
; LA64-NEXT: ret
entry:
%0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i16)
@@ -151,16 +139,14 @@ define dso_local signext i32 @tlsle_load_s32() nounwind {
; LA32: # %bb.0: # %entry
; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32)
; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32)
-; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32)
-; LA32-NEXT: ld.w $a0, $a0, 0
+; LA32-NEXT: ld.w $a0, $a0, %le_lo12_r(g_i32)
; LA32-NEXT: ret
;
; LA64-LABEL: tlsle_load_s32:
; LA64: # %bb.0: # %entry
; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32)
; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32)
-; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32)
-; LA64-NEXT: ld.w $a0, $a0, 0
+; LA64-NEXT: ld.w $a0, $a0, %le_lo12_r(g_i32)
; LA64-NEXT: ret
entry:
%0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32)
@@ -173,16 +159,14 @@ define dso_local zeroext i32 @tlsle_load_u32() nounwind {
; LA32: # %bb.0: # %entry
; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32)
; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32)
-; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32)
-; LA32-NEXT: ld.w $a0, $a0, 0
+; LA32-NEXT: ld.w $a0, $a0, %le_lo12_r(g_i32)
; LA32-NEXT: ret
;
; LA64-LABEL: tlsle_load_u32:
; LA64: # %bb.0: # %entry
; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32)
; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32)
-; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32)
-; LA64-NEXT: ld.wu $a0, $a0, 0
+; LA64-NEXT: ld.wu $a0, $a0, %le_lo12_r(g_i32)
; LA64-NEXT: ret
entry:
%0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32)
@@ -195,18 +179,16 @@ define dso_local void @tlsle_store_i32() nounwind {
; LA32: # %bb.0: # %entry
; LA32-NEXT: lu12i.w $a0, %le_hi20_r(g_i32)
; LA32-NEXT: add.w $a0, $a0, $tp, %le_add_r(g_i32)
-; LA32-NEXT: addi.w $a0, $a0, %le_lo12_r(g_i32)
; LA32-NEXT: ori $a1, $zero, 1
-; LA32-NEXT: st.w $a1, $a0, 0
+; LA32-NEXT: st.w $a1, $a0, %le_lo12_r(g_i32)
; LA32-NEXT: ret
;
; LA64-LABEL: tlsle_store_i32:
; LA64: # %bb.0: # %entry
; LA64-NEXT: lu12i.w $a0, %le_hi20_r(g_i32)
; LA64-NEXT: add.d $a0, $a0, $tp, %le_add_r(g_i32)
-; LA64-NEXT: addi.d $a0, $a0, %le_lo12_r(g_i32)
; LA64-NEXT: ori $a1, $zero, 1
-; LA64-NEXT: st.w $a1, $a0, 0
+; LA64-NEXT: st.w $a1, $a0, %le_lo12_r(g_i32)
; LA64-NEXT: ret
entry:
%0 = call ptr @llvm.threadlocal.address.p0(ptr @g_i32)
@@ -230,8 +212,7 @@ define dso_local i64 @tlsle_load_i64() noun...
[truncated]
LGTM. Thanks