Skip to content

Commit 203ba23

Browse files
authored
[LoongArch] Improve codegen for atomic ops (#67391)
This PR improves the memory barriers generated for atomic operations.

Memory barrier semantics of LL/SC:

```
LL: <memory-barrier> + <load-exclusive>
SC: <store-conditional> + <memory-barrier>
```

Changes:

* Remove unnecessary memory barriers before LL and between LL/SC.
* Fix acquire semantics. If the SC instruction is not executed (for example, when the comparison in a cmpxchg fails and control branches past it), the barrier implied by SC never takes effect, so acquire semantics cannot be guaranteed. Therefore, an acquire barrier needs to be generated whenever the memory ordering includes an acquire operation.
1 parent a157e79 commit 203ba23

File tree

8 files changed

+401
-152
lines changed

8 files changed

+401
-152
lines changed

llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp

Lines changed: 19 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -153,18 +153,12 @@ static void doAtomicBinOpExpansion(const LoongArchInstrInfo *TII,
153153
Register ScratchReg = MI.getOperand(1).getReg();
154154
Register AddrReg = MI.getOperand(2).getReg();
155155
Register IncrReg = MI.getOperand(3).getReg();
156-
AtomicOrdering Ordering =
157-
static_cast<AtomicOrdering>(MI.getOperand(4).getImm());
158156

159157
// .loop:
160-
// if(Ordering != AtomicOrdering::Monotonic)
161-
// dbar 0
162158
// ll.[w|d] dest, (addr)
163159
// binop scratch, dest, val
164160
// sc.[w|d] scratch, scratch, (addr)
165161
// beqz scratch, loop
166-
if (Ordering != AtomicOrdering::Monotonic)
167-
BuildMI(LoopMBB, DL, TII->get(LoongArch::DBAR)).addImm(0);
168162
BuildMI(LoopMBB, DL,
169163
TII->get(Width == 32 ? LoongArch::LL_W : LoongArch::LL_D), DestReg)
170164
.addReg(AddrReg)
@@ -251,21 +245,15 @@ static void doMaskedAtomicBinOpExpansion(
251245
Register AddrReg = MI.getOperand(2).getReg();
252246
Register IncrReg = MI.getOperand(3).getReg();
253247
Register MaskReg = MI.getOperand(4).getReg();
254-
AtomicOrdering Ordering =
255-
static_cast<AtomicOrdering>(MI.getOperand(5).getImm());
256248

257249
// .loop:
258-
// if(Ordering != AtomicOrdering::Monotonic)
259-
// dbar 0
260250
// ll.w destreg, (alignedaddr)
261251
// binop scratch, destreg, incr
262252
// xor scratch, destreg, scratch
263253
// and scratch, scratch, masktargetdata
264254
// xor scratch, destreg, scratch
265255
// sc.w scratch, scratch, (alignedaddr)
266256
// beqz scratch, loop
267-
if (Ordering != AtomicOrdering::Monotonic)
268-
BuildMI(LoopMBB, DL, TII->get(LoongArch::DBAR)).addImm(0);
269257
BuildMI(LoopMBB, DL, TII->get(LoongArch::LL_W), DestReg)
270258
.addReg(AddrReg)
271259
.addImm(0);
@@ -372,23 +360,20 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp(
372360
auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
373361
auto LoopIfBodyMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
374362
auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
375-
auto TailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
376363
auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
377364

378365
// Insert new MBBs.
379366
MF->insert(++MBB.getIterator(), LoopHeadMBB);
380367
MF->insert(++LoopHeadMBB->getIterator(), LoopIfBodyMBB);
381368
MF->insert(++LoopIfBodyMBB->getIterator(), LoopTailMBB);
382-
MF->insert(++LoopTailMBB->getIterator(), TailMBB);
383-
MF->insert(++TailMBB->getIterator(), DoneMBB);
369+
MF->insert(++LoopTailMBB->getIterator(), DoneMBB);
384370

385371
// Set up successors and transfer remaining instructions to DoneMBB.
386372
LoopHeadMBB->addSuccessor(LoopIfBodyMBB);
387373
LoopHeadMBB->addSuccessor(LoopTailMBB);
388374
LoopIfBodyMBB->addSuccessor(LoopTailMBB);
389375
LoopTailMBB->addSuccessor(LoopHeadMBB);
390-
LoopTailMBB->addSuccessor(TailMBB);
391-
TailMBB->addSuccessor(DoneMBB);
376+
LoopTailMBB->addSuccessor(DoneMBB);
392377
DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end());
393378
DoneMBB->transferSuccessors(&MBB);
394379
MBB.addSuccessor(LoopHeadMBB);
@@ -402,11 +387,9 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp(
402387

403388
//
404389
// .loophead:
405-
// dbar 0
406390
// ll.w destreg, (alignedaddr)
407391
// and scratch2, destreg, mask
408392
// move scratch1, destreg
409-
BuildMI(LoopHeadMBB, DL, TII->get(LoongArch::DBAR)).addImm(0);
410393
BuildMI(LoopHeadMBB, DL, TII->get(LoongArch::LL_W), DestReg)
411394
.addReg(AddrReg)
412395
.addImm(0);
@@ -463,7 +446,6 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp(
463446
// .looptail:
464447
// sc.w scratch1, scratch1, (addr)
465448
// beqz scratch1, loop
466-
// dbar 0x700
467449
BuildMI(LoopTailMBB, DL, TII->get(LoongArch::SC_W), Scratch1Reg)
468450
.addReg(Scratch1Reg)
469451
.addReg(AddrReg)
@@ -472,18 +454,13 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp(
472454
.addReg(Scratch1Reg)
473455
.addMBB(LoopHeadMBB);
474456

475-
// .tail:
476-
// dbar 0x700
477-
BuildMI(TailMBB, DL, TII->get(LoongArch::DBAR)).addImm(0x700);
478-
479457
NextMBBI = MBB.end();
480458
MI.eraseFromParent();
481459

482460
LivePhysRegs LiveRegs;
483461
computeAndAddLiveIns(LiveRegs, *LoopHeadMBB);
484462
computeAndAddLiveIns(LiveRegs, *LoopIfBodyMBB);
485463
computeAndAddLiveIns(LiveRegs, *LoopTailMBB);
486-
computeAndAddLiveIns(LiveRegs, *TailMBB);
487464
computeAndAddLiveIns(LiveRegs, *DoneMBB);
488465

489466
return true;
@@ -535,12 +512,10 @@ bool LoongArchExpandAtomicPseudo::expandAtomicCmpXchg(
535512
.addReg(CmpValReg)
536513
.addMBB(TailMBB);
537514
// .looptail:
538-
// dbar 0
539515
// move scratch, newval
540516
// sc.[w|d] scratch, scratch, (addr)
541517
// beqz scratch, loophead
542518
// b done
543-
BuildMI(LoopTailMBB, DL, TII->get(LoongArch::DBAR)).addImm(0);
544519
BuildMI(LoopTailMBB, DL, TII->get(LoongArch::OR), ScratchReg)
545520
.addReg(NewValReg)
546521
.addReg(LoongArch::R0);
@@ -573,13 +548,11 @@ bool LoongArchExpandAtomicPseudo::expandAtomicCmpXchg(
573548
.addMBB(TailMBB);
574549

575550
// .looptail:
576-
// dbar 0
577551
// andn scratch, dest, mask
578552
// or scratch, scratch, newval
579553
// sc.[w|d] scratch, scratch, (addr)
580554
// beqz scratch, loophead
581555
// b done
582-
BuildMI(LoopTailMBB, DL, TII->get(LoongArch::DBAR)).addImm(0);
583556
BuildMI(LoopTailMBB, DL, TII->get(LoongArch::ANDN), ScratchReg)
584557
.addReg(DestReg)
585558
.addReg(MaskReg);
@@ -598,9 +571,24 @@ bool LoongArchExpandAtomicPseudo::expandAtomicCmpXchg(
598571
BuildMI(LoopTailMBB, DL, TII->get(LoongArch::B)).addMBB(DoneMBB);
599572
}
600573

574+
AtomicOrdering Ordering =
575+
static_cast<AtomicOrdering>(MI.getOperand(IsMasked ? 6 : 5).getImm());
576+
int hint;
577+
578+
switch (Ordering) {
579+
case AtomicOrdering::Acquire:
580+
case AtomicOrdering::AcquireRelease:
581+
case AtomicOrdering::SequentiallyConsistent:
582+
// TODO: acquire
583+
hint = 0;
584+
break;
585+
default:
586+
hint = 0x700;
587+
}
588+
601589
// .tail:
602-
// dbar 0x700
603-
BuildMI(TailMBB, DL, TII->get(LoongArch::DBAR)).addImm(0x700);
590+
// dbar 0x700 | acquire
591+
BuildMI(TailMBB, DL, TII->get(LoongArch::DBAR)).addImm(hint);
604592

605593
NextMBBI = MBB.end();
606594
MI.eraseFromParent();

llvm/lib/Target/LoongArch/LoongArchInstrInfo.td

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1792,7 +1792,7 @@ def PseudoMaskedAtomicLoadMin32 : PseudoMaskedAMMinMax;
17921792

17931793
class PseudoCmpXchg
17941794
: Pseudo<(outs GPR:$res, GPR:$scratch),
1795-
(ins GPR:$addr, GPR:$cmpval, GPR:$newval)> {
1795+
(ins GPR:$addr, GPR:$cmpval, GPR:$newval, grlenimm:$ordering)> {
17961796
let Constraints = "@earlyclobber $res,@earlyclobber $scratch";
17971797
let mayLoad = 1;
17981798
let mayStore = 1;
@@ -1882,14 +1882,28 @@ def : AtomicPat<int_loongarch_masked_atomicrmw_umax_i64,
18821882
def : AtomicPat<int_loongarch_masked_atomicrmw_umin_i64,
18831883
PseudoMaskedAtomicLoadUMin32>;
18841884

1885-
def : Pat<(atomic_cmp_swap_64 GPR:$addr, GPR:$cmp, GPR:$new),
1886-
(PseudoCmpXchg64 GPR:$addr, GPR:$cmp, GPR:$new)>;
1885+
// Ordering constants must be kept in sync with the AtomicOrdering enum in
1886+
// AtomicOrdering.h.
1887+
multiclass PseudoCmpXchgPat<string Op, Pseudo CmpXchgInst,
1888+
ValueType vt = GRLenVT> {
1889+
def : Pat<(vt (!cast<PatFrag>(Op#"_monotonic") GPR:$addr, GPR:$cmp, GPR:$new)),
1890+
(CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 2)>;
1891+
def : Pat<(vt (!cast<PatFrag>(Op#"_acquire") GPR:$addr, GPR:$cmp, GPR:$new)),
1892+
(CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 4)>;
1893+
def : Pat<(vt (!cast<PatFrag>(Op#"_release") GPR:$addr, GPR:$cmp, GPR:$new)),
1894+
(CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 5)>;
1895+
def : Pat<(vt (!cast<PatFrag>(Op#"_acq_rel") GPR:$addr, GPR:$cmp, GPR:$new)),
1896+
(CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 6)>;
1897+
def : Pat<(vt (!cast<PatFrag>(Op#"_seq_cst") GPR:$addr, GPR:$cmp, GPR:$new)),
1898+
(CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 7)>;
1899+
}
1900+
1901+
defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32>;
1902+
defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64, i64>;
18871903
def : Pat<(int_loongarch_masked_cmpxchg_i64
18881904
GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering),
18891905
(PseudoMaskedCmpXchg32
18901906
GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering)>;
1891-
def : Pat<(atomic_cmp_swap_32 GPR:$addr, GPR:$cmp, GPR:$new),
1892-
(PseudoCmpXchg32 GPR:$addr, GPR:$cmp, GPR:$new)>;
18931907

18941908
def : PseudoMaskedAMMinMaxPat<int_loongarch_masked_atomicrmw_max_i64,
18951909
PseudoMaskedAtomicLoadMax32>;

llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -33,14 +33,13 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
3333
; LA64-NEXT: bne $a5, $a2, .LBB0_5
3434
; LA64-NEXT: # %bb.4: # %atomicrmw.start
3535
; LA64-NEXT: # in Loop: Header=BB0_3 Depth=2
36-
; LA64-NEXT: dbar 0
3736
; LA64-NEXT: move $a7, $a6
3837
; LA64-NEXT: sc.w $a7, $a0, 0
3938
; LA64-NEXT: beqz $a7, .LBB0_3
4039
; LA64-NEXT: b .LBB0_6
4140
; LA64-NEXT: .LBB0_5: # %atomicrmw.start
4241
; LA64-NEXT: # in Loop: Header=BB0_1 Depth=1
43-
; LA64-NEXT: dbar 1792
42+
; LA64-NEXT: dbar 0
4443
; LA64-NEXT: .LBB0_6: # %atomicrmw.start
4544
; LA64-NEXT: # in Loop: Header=BB0_1 Depth=1
4645
; LA64-NEXT: addi.w $a6, $a2, 0
@@ -86,14 +85,13 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
8685
; LA64-NEXT: bne $a5, $a2, .LBB1_5
8786
; LA64-NEXT: # %bb.4: # %atomicrmw.start
8887
; LA64-NEXT: # in Loop: Header=BB1_3 Depth=2
89-
; LA64-NEXT: dbar 0
9088
; LA64-NEXT: move $a7, $a6
9189
; LA64-NEXT: sc.w $a7, $a0, 0
9290
; LA64-NEXT: beqz $a7, .LBB1_3
9391
; LA64-NEXT: b .LBB1_6
9492
; LA64-NEXT: .LBB1_5: # %atomicrmw.start
9593
; LA64-NEXT: # in Loop: Header=BB1_1 Depth=1
96-
; LA64-NEXT: dbar 1792
94+
; LA64-NEXT: dbar 0
9795
; LA64-NEXT: .LBB1_6: # %atomicrmw.start
9896
; LA64-NEXT: # in Loop: Header=BB1_1 Depth=1
9997
; LA64-NEXT: addi.w $a6, $a2, 0
@@ -127,14 +125,13 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
127125
; LA64-NEXT: bne $a1, $a3, .LBB2_5
128126
; LA64-NEXT: # %bb.4: # %atomicrmw.start
129127
; LA64-NEXT: # in Loop: Header=BB2_3 Depth=2
130-
; LA64-NEXT: dbar 0
131128
; LA64-NEXT: move $a6, $a5
132129
; LA64-NEXT: sc.w $a6, $a0, 0
133130
; LA64-NEXT: beqz $a6, .LBB2_3
134131
; LA64-NEXT: b .LBB2_6
135132
; LA64-NEXT: .LBB2_5: # %atomicrmw.start
136133
; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1
137-
; LA64-NEXT: dbar 1792
134+
; LA64-NEXT: dbar 0
138135
; LA64-NEXT: .LBB2_6: # %atomicrmw.start
139136
; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1
140137
; LA64-NEXT: move $a3, $a1
@@ -166,14 +163,13 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
166163
; LA64-NEXT: bne $a2, $a3, .LBB3_5
167164
; LA64-NEXT: # %bb.4: # %atomicrmw.start
168165
; LA64-NEXT: # in Loop: Header=BB3_3 Depth=2
169-
; LA64-NEXT: dbar 0
170166
; LA64-NEXT: move $a5, $a4
171167
; LA64-NEXT: sc.d $a5, $a0, 0
172168
; LA64-NEXT: beqz $a5, .LBB3_3
173169
; LA64-NEXT: b .LBB3_6
174170
; LA64-NEXT: .LBB3_5: # %atomicrmw.start
175171
; LA64-NEXT: # in Loop: Header=BB3_1 Depth=1
176-
; LA64-NEXT: dbar 1792
172+
; LA64-NEXT: dbar 0
177173
; LA64-NEXT: .LBB3_6: # %atomicrmw.start
178174
; LA64-NEXT: # in Loop: Header=BB3_1 Depth=1
179175
; LA64-NEXT: bne $a2, $a3, .LBB3_1
@@ -221,14 +217,13 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
221217
; LA64-NEXT: bne $a6, $a2, .LBB4_5
222218
; LA64-NEXT: # %bb.4: # %atomicrmw.start
223219
; LA64-NEXT: # in Loop: Header=BB4_3 Depth=2
224-
; LA64-NEXT: dbar 0
225220
; LA64-NEXT: move $t0, $a7
226221
; LA64-NEXT: sc.w $t0, $a0, 0
227222
; LA64-NEXT: beqz $t0, .LBB4_3
228223
; LA64-NEXT: b .LBB4_6
229224
; LA64-NEXT: .LBB4_5: # %atomicrmw.start
230225
; LA64-NEXT: # in Loop: Header=BB4_1 Depth=1
231-
; LA64-NEXT: dbar 1792
226+
; LA64-NEXT: dbar 0
232227
; LA64-NEXT: .LBB4_6: # %atomicrmw.start
233228
; LA64-NEXT: # in Loop: Header=BB4_1 Depth=1
234229
; LA64-NEXT: addi.w $a7, $a2, 0
@@ -279,14 +274,13 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
279274
; LA64-NEXT: bne $a6, $a2, .LBB5_5
280275
; LA64-NEXT: # %bb.4: # %atomicrmw.start
281276
; LA64-NEXT: # in Loop: Header=BB5_3 Depth=2
282-
; LA64-NEXT: dbar 0
283277
; LA64-NEXT: move $t0, $a7
284278
; LA64-NEXT: sc.w $t0, $a0, 0
285279
; LA64-NEXT: beqz $t0, .LBB5_3
286280
; LA64-NEXT: b .LBB5_6
287281
; LA64-NEXT: .LBB5_5: # %atomicrmw.start
288282
; LA64-NEXT: # in Loop: Header=BB5_1 Depth=1
289-
; LA64-NEXT: dbar 1792
283+
; LA64-NEXT: dbar 0
290284
; LA64-NEXT: .LBB5_6: # %atomicrmw.start
291285
; LA64-NEXT: # in Loop: Header=BB5_1 Depth=1
292286
; LA64-NEXT: addi.w $a7, $a2, 0
@@ -325,14 +319,13 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
325319
; LA64-NEXT: bne $a2, $a4, .LBB6_5
326320
; LA64-NEXT: # %bb.4: # %atomicrmw.start
327321
; LA64-NEXT: # in Loop: Header=BB6_3 Depth=2
328-
; LA64-NEXT: dbar 0
329322
; LA64-NEXT: move $a7, $a6
330323
; LA64-NEXT: sc.w $a7, $a0, 0
331324
; LA64-NEXT: beqz $a7, .LBB6_3
332325
; LA64-NEXT: b .LBB6_6
333326
; LA64-NEXT: .LBB6_5: # %atomicrmw.start
334327
; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1
335-
; LA64-NEXT: dbar 1792
328+
; LA64-NEXT: dbar 0
336329
; LA64-NEXT: .LBB6_6: # %atomicrmw.start
337330
; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1
338331
; LA64-NEXT: move $a4, $a2
@@ -369,14 +362,13 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
369362
; LA64-NEXT: bne $a2, $a3, .LBB7_5
370363
; LA64-NEXT: # %bb.4: # %atomicrmw.start
371364
; LA64-NEXT: # in Loop: Header=BB7_3 Depth=2
372-
; LA64-NEXT: dbar 0
373365
; LA64-NEXT: move $a5, $a4
374366
; LA64-NEXT: sc.d $a5, $a0, 0
375367
; LA64-NEXT: beqz $a5, .LBB7_3
376368
; LA64-NEXT: b .LBB7_6
377369
; LA64-NEXT: .LBB7_5: # %atomicrmw.start
378370
; LA64-NEXT: # in Loop: Header=BB7_1 Depth=1
379-
; LA64-NEXT: dbar 1792
371+
; LA64-NEXT: dbar 0
380372
; LA64-NEXT: .LBB7_6: # %atomicrmw.start
381373
; LA64-NEXT: # in Loop: Header=BB7_1 Depth=1
382374
; LA64-NEXT: bne $a2, $a3, .LBB7_1

0 commit comments

Comments
 (0)