@@ -37,6 +37,8 @@ class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass {
37
37
bool detectFoldable (MachineInstr &Hi20, MachineInstr *&Lo12,
38
38
MachineInstr *&Lo20, MachineInstr *&Hi12,
39
39
MachineInstr *&Last);
40
+ bool detectFoldable (MachineInstr &Hi20, MachineInstr *&Add,
41
+ MachineInstr *&Lo12);
40
42
41
43
bool detectAndFoldOffset (MachineInstr &Hi20, MachineInstr &Lo12,
42
44
MachineInstr *&Lo20, MachineInstr *&Hi12,
@@ -176,7 +178,80 @@ bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
176
178
return true ;
177
179
}
178
180
179
- // Update the offset in Hi20, Lo12, Lo20 and Hi12 instructions.
181
+ // Detect the pattern:
182
+ //
183
+ // (small/medium):
184
+ // lu12i.w vreg1, %le_hi20_r(s)
185
+ // add.w/d vreg2, vreg1, r2, %le_add_r(s)
186
+ // addi.w/d vreg3, vreg2, %le_lo12_r(s)
187
+
188
+ // The pattern is only accepted if:
189
+ // 1) The first instruction has only one use, which is the PseudoAddTPRel.
190
+ // The second instruction has only one use, which is the ADDI. The
191
+ // second instruction's last operand is the tp register.
192
+ // 2) The address operands have the appropriate type, reflecting the
193
+ // lowering of a thread_local global address using the pattern.
194
+ // 3) The offset value in the ThreadLocal Global Address is 0.
195
+ bool LoongArchMergeBaseOffsetOpt::detectFoldable (MachineInstr &Hi20,
196
+ MachineInstr *&Add,
197
+ MachineInstr *&Lo12) {
198
+ if (Hi20.getOpcode () != LoongArch::LU12I_W)
199
+ return false ;
200
+
201
+ auto isGlobalOrCPI = [](const MachineOperand &Op) {
202
+ return Op.isGlobal () || Op.isCPI ();
203
+ };
204
+
205
+ const MachineOperand &Hi20Op1 = Hi20.getOperand (1 );
206
+ if (LoongArchII::getDirectFlags (Hi20Op1) != LoongArchII::MO_LE_HI_R ||
207
+ !isGlobalOrCPI (Hi20Op1) || Hi20Op1.getOffset () != 0 )
208
+ return false ;
209
+
210
+ Register HiDestReg = Hi20.getOperand (0 ).getReg ();
211
+ if (!MRI->hasOneUse (HiDestReg))
212
+ return false ;
213
+
214
+ Add = &*MRI->use_instr_begin (HiDestReg);
215
+ if ((ST->is64Bit () && Add->getOpcode () != LoongArch::PseudoAddTPRel_D) ||
216
+ (!ST->is64Bit () && Add->getOpcode () != LoongArch::PseudoAddTPRel_W))
217
+ return false ;
218
+
219
+ if (Add->getOperand (2 ).getReg () != LoongArch::R2)
220
+ return false ;
221
+
222
+ const MachineOperand &AddOp3 = Add->getOperand (3 );
223
+ if (LoongArchII::getDirectFlags (AddOp3) != LoongArchII::MO_LE_ADD_R ||
224
+ !(isGlobalOrCPI (AddOp3) || AddOp3.isMCSymbol ()) ||
225
+ AddOp3.getOffset () != 0 )
226
+ return false ;
227
+
228
+ Register AddDestReg = Add->getOperand (0 ).getReg ();
229
+ if (!MRI->hasOneUse (AddDestReg))
230
+ return false ;
231
+
232
+ Lo12 = &*MRI->use_instr_begin (AddDestReg);
233
+ if ((ST->is64Bit () && Lo12->getOpcode () != LoongArch::ADDI_D) ||
234
+ (!ST->is64Bit () && Lo12->getOpcode () != LoongArch::ADDI_W))
235
+ return false ;
236
+
237
+ const MachineOperand &Lo12Op2 = Lo12->getOperand (2 );
238
+ if (LoongArchII::getDirectFlags (Lo12Op2) != LoongArchII::MO_LE_LO_R ||
239
+ !(isGlobalOrCPI (Lo12Op2) || Lo12Op2.isMCSymbol ()) ||
240
+ Lo12Op2.getOffset () != 0 )
241
+ return false ;
242
+
243
+ if (Hi20Op1.isGlobal ()) {
244
+ LLVM_DEBUG (dbgs () << " Found lowered global address: "
245
+ << *Hi20Op1.getGlobal () << " \n " );
246
+ } else if (Hi20Op1.isCPI ()) {
247
+ LLVM_DEBUG (dbgs () << " Found lowered constant pool: " << Hi20Op1.getIndex ()
248
+ << " \n " );
249
+ }
250
+
251
+ return true ;
252
+ }
253
+
254
+ // Update the offset in Hi20, (Add), Lo12, (Lo20 and Hi12) instructions.
180
255
// Delete the tail instruction and update all the uses to use the
181
256
// output from Last.
182
257
void LoongArchMergeBaseOffsetOpt::foldOffset (
@@ -190,31 +265,49 @@ void LoongArchMergeBaseOffsetOpt::foldOffset(
190
265
Lo20->getOperand (2 ).setOffset (Offset);
191
266
Hi12->getOperand (2 ).setOffset (Offset);
192
267
}
268
+
269
+ // For tls-le, the offset of the second instr (PseudoAddTPRel) should also be
270
+ // updated.
271
+ MachineInstr *Add = &*MRI->use_instr_begin (Hi20.getOperand (0 ).getReg ());
272
+ if (Hi20.getOpcode () == LoongArch::LU12I_W)
273
+ Add->getOperand (3 ).setOffset (Offset);
274
+
193
275
// Delete the tail instruction.
194
276
MachineInstr *Def = Last ? Last : &Lo12;
195
277
MRI->constrainRegClass (Def->getOperand (0 ).getReg (),
196
278
MRI->getRegClass (Tail.getOperand (0 ).getReg ()));
197
279
MRI->replaceRegWith (Tail.getOperand (0 ).getReg (), Def->getOperand (0 ).getReg ());
198
280
Tail.eraseFromParent ();
281
+
199
282
LLVM_DEBUG (dbgs () << " Merged offset " << Offset << " into base.\n "
200
- << " " << Hi20 << " " << Lo12;);
283
+ << " " << Hi20;);
284
+ if (Hi20.getOpcode () == LoongArch::LU12I_W) {
285
+ LLVM_DEBUG (dbgs () << " " << *Add;);
286
+ }
287
+ LLVM_DEBUG (dbgs () << " " << Lo12;);
201
288
if (Lo20 && Hi12) {
202
289
LLVM_DEBUG (dbgs () << " " << *Lo20 << " " << *Hi12;);
203
290
}
204
291
}
205
292
206
293
// Detect patterns for large offsets that are passed into an ADD instruction.
207
- // If the pattern is found, updates the offset in Hi20, Lo12, Lo20 and Hi12
208
- // instructions and deletes TailAdd and the instructions that produced the
209
- // offset.
294
+ // If the pattern is found, updates the offset in Hi20, (Add), Lo12,
295
+ // (Lo20 and Hi12) instructions and deletes TailAdd and the instructions that
296
+ // produced the offset.
210
297
//
211
298
// (The instructions marked with "!" are not necessarily present)
212
299
//
213
300
// Base address lowering is of the form:
214
- // Hi20: pcalau12i vreg1, %pc_hi20(s)
215
- // +- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
216
- // | Lo20: lu32i.d vreg2, %pc64_lo20(s) !
217
- // +- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
301
+ // 1) pcala:
302
+ // Hi20: pcalau12i vreg1, %pc_hi20(s)
303
+ // +--- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
304
+ // | Lo20: lu32i.d vreg2, %pc64_lo20(s) !
305
+ // +--- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
306
+ // |
307
+ // | 2) tls-le:
308
+ // | Hi20: lu12i.w vreg1, %le_hi20_r(s)
309
+ // | Add: add.w/d vreg1, vreg1, r2, %le_add_r(s)
310
+ // +--- Lo12: addi.w/d vreg2, vreg1, %le_lo12_r(s)
218
311
// |
219
312
// | The large offset can be one of the forms:
220
313
// |
@@ -334,7 +427,8 @@ bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,
334
427
335
428
// Look for arithmetic instructions we can get an offset from.
336
429
// We might be able to remove the arithmetic instructions by folding the
337
- // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I).
430
+ // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or
431
+ // LU12I_W+PseudoAddTPRel+ADDI.
338
432
if (!MRI->hasOneUse (DestReg))
339
433
return false ;
340
434
@@ -454,6 +548,7 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
454
548
// If all the uses are memory ops with the same offset, we can transform:
455
549
//
456
550
// 1. (small/medium):
551
+ // 1.1. pcala
457
552
// pcalau12i vreg1, %pc_hi20(s)
458
553
// addi.d vreg2, vreg1, %pc_lo12(s)
459
554
// ld.w vreg3, 8(vreg2)
@@ -463,6 +558,18 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
463
558
// pcalau12i vreg1, %pc_hi20(s+8)
464
559
// ld.w vreg3, vreg1, %pc_lo12(s+8)(vreg1)
465
560
//
561
+ // 1.2. tls-le
562
+ // lu12i.w vreg1, %le_hi20_r(s)
563
+ // add.w/d vreg2, vreg1, r2, %le_add_r(s)
564
+ // addi.w/d vreg3, vreg2, %le_lo12_r(s)
565
+ // ld.w vreg4, 8(vreg3)
566
+ //
567
+ // =>
568
+ //
569
+ // lu12i.w vreg1, %le_hi20_r(s+8)
570
+ // add.w/d vreg2, vreg1, r2, %le_add_r(s+8)
571
+ // ld.w vreg4, vreg2, %le_lo12_r(s+8)(vreg2)
572
+ //
466
573
// 2. (large):
467
574
// pcalau12i vreg1, %pc_hi20(s)
468
575
// addi.d vreg2, $zero, %pc_lo12(s)
@@ -598,7 +705,8 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
598
705
return false ;
599
706
600
707
// If optimized by this pass successfully, MO_RELAX bitmask target-flag should
601
- // be removed from the code sequence.
708
+ // be removed from the pcala code sequence. Code sequence of tls-le can still
709
+ // be relaxed after being optimized.
602
710
//
603
711
// For example:
604
712
// pcalau12i $a0, %pc_hi20(symbol)
@@ -614,15 +722,20 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
614
722
// optimized, it cannot be relaxed any more. So MO_RELAX flag should not be
615
723
// carried by them.
616
724
Hi20.getOperand (1 ).setOffset (NewOffset);
617
- Hi20.getOperand (1 ).setTargetFlags (
618
- LoongArchII::getDirectFlags (Hi20.getOperand (1 )));
619
725
MachineOperand &ImmOp = Lo12.getOperand (2 );
620
726
ImmOp.setOffset (NewOffset);
621
- ImmOp.setTargetFlags (LoongArchII::getDirectFlags (ImmOp));
622
727
if (Lo20 && Hi12) {
623
728
Lo20->getOperand (2 ).setOffset (NewOffset);
624
729
Hi12->getOperand (2 ).setOffset (NewOffset);
625
730
}
731
+ if (Hi20.getOpcode () == LoongArch::PCALAU12I) {
732
+ Hi20.getOperand (1 ).setTargetFlags (
733
+ LoongArchII::getDirectFlags (Hi20.getOperand (1 )));
734
+ ImmOp.setTargetFlags (LoongArchII::getDirectFlags (ImmOp));
735
+ } else if (Hi20.getOpcode () == LoongArch::LU12I_W) {
736
+ MachineInstr *Add = &*MRI->use_instr_begin (Hi20.getOperand (0 ).getReg ());
737
+ Add->getOperand (3 ).setOffset (NewOffset);
738
+ }
626
739
627
740
// Update the immediate in the load/store instructions to add the offset.
628
741
const LoongArchInstrInfo &TII = *ST->getInstrInfo ();
@@ -673,7 +786,14 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
673
786
return true ;
674
787
}
675
788
676
- MRI->replaceRegWith (Lo12.getOperand (0 ).getReg (), Hi20.getOperand (0 ).getReg ());
789
+ if (Hi20.getOpcode () == LoongArch::PCALAU12I) {
790
+ MRI->replaceRegWith (Lo12.getOperand (0 ).getReg (),
791
+ Hi20.getOperand (0 ).getReg ());
792
+ } else if (Hi20.getOpcode () == LoongArch::LU12I_W) {
793
+ MachineInstr *Add = &*MRI->use_instr_begin (Hi20.getOperand (0 ).getReg ());
794
+ MRI->replaceRegWith (Lo12.getOperand (0 ).getReg (),
795
+ Add->getOperand (0 ).getReg ());
796
+ }
677
797
Lo12.eraseFromParent ();
678
798
return true ;
679
799
}
@@ -693,8 +813,21 @@ bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
693
813
MachineInstr *Lo20 = nullptr ;
694
814
MachineInstr *Hi12 = nullptr ;
695
815
MachineInstr *Last = nullptr ;
696
- if (!detectFoldable (Hi20, Lo12, Lo20, Hi12, Last))
816
+ if (Hi20.getOpcode () == LoongArch::PCALAU12I) {
817
+ // Detect foldable pcala code sequence in small/medium/large code model.
818
+ if (!detectFoldable (Hi20, Lo12, Lo20, Hi12, Last))
819
+ continue ;
820
+ } else if (Hi20.getOpcode () == LoongArch::LU12I_W) {
821
+ MachineInstr *Add = nullptr ;
822
+ // Detect foldable tls-le code sequence in small/medium code model.
823
+ if (!detectFoldable (Hi20, Add, Lo12))
824
+ continue ;
825
+ } else {
697
826
continue ;
827
+ }
828
+ // For tls-le, we do not pass the second PseudoAddTPRel instr in order to
829
+ // reuse the existing hooks and the last three parameters should always be
830
+ // nullptr.
698
831
MadeChange |= detectAndFoldOffset (Hi20, *Lo12, Lo20, Hi12, Last);
699
832
MadeChange |= foldIntoMemoryOps (Hi20, *Lo12, Lo20, Hi12, Last);
700
833
}
0 commit comments