[LoongArch] Improve codegen for atomic ops #67391
@llvm/pr-subscribers-backend-loongarch

Changes

This PR improves memory barriers generated by atomic operations. Memory barrier semantics of LL/SC:

LL: &lt;memory-barrier&gt; + &lt;load-exclusive&gt;
SC: &lt;store-conditional&gt; + &lt;memory-barrier&gt;

Changes:
* Remove unnecessary memory barriers before LL and between LL/SC.
* Fix acquire semantics: if the SC instruction is not executed, the acquire guarantee cannot be ensured, so an acquire barrier needs to be generated when the memory ordering includes an acquire operation.
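To make the acquire problem concrete, here is a minimal C++ sketch (not part of the patch; a distilled version of the reproducer further down in this thread) of the pattern the fix targets: even when a weak compare-exchange fails, so the LL/SC loop never executes SC, its acquire failure ordering must still order the subsequent load.

```cpp
#include <atomic>
#include <cstdio>

std::atomic<unsigned> tag{0};
std::atomic<unsigned> val{0}; // stored with memory_order_release before 'tag' by a writer thread

void reader() {
  unsigned expected = 0;
  // Failure ordering is acquire: even when the exchange fails -- the path on
  // which the LL/SC loop never reaches SC -- the load of 'tag' must still
  // synchronize with the writer's release store, so the later relaxed load
  // of 'val' cannot observe a value older than the tag.
  if (!tag.compare_exchange_weak(expected, 0u, std::memory_order_acquire,
                                 std::memory_order_acquire)) {
    unsigned v = val.load(std::memory_order_relaxed);
    if (v < expected)
      std::printf("UNEXPECTED: val(%u) < tag(%u)\n", v, expected);
  }
}
```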
Patch is 57.35 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/67391.diff

8 Files Affected:
diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp
index 51df0463e235248..fdc669035cacb0f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchExpandAtomicPseudoInsts.cpp
@@ -153,18 +153,12 @@ static void doAtomicBinOpExpansion(const LoongArchInstrInfo *TII,
Register ScratchReg = MI.getOperand(1).getReg();
Register AddrReg = MI.getOperand(2).getReg();
Register IncrReg = MI.getOperand(3).getReg();
- AtomicOrdering Ordering =
- static_cast<AtomicOrdering>(MI.getOperand(4).getImm());
// .loop:
- // if(Ordering != AtomicOrdering::Monotonic)
- // dbar 0
// ll.[w|d] dest, (addr)
// binop scratch, dest, val
// sc.[w|d] scratch, scratch, (addr)
// beqz scratch, loop
- if (Ordering != AtomicOrdering::Monotonic)
- BuildMI(LoopMBB, DL, TII->get(LoongArch::DBAR)).addImm(0);
BuildMI(LoopMBB, DL,
TII->get(Width == 32 ? LoongArch::LL_W : LoongArch::LL_D), DestReg)
.addReg(AddrReg)
@@ -251,12 +245,8 @@ static void doMaskedAtomicBinOpExpansion(
Register AddrReg = MI.getOperand(2).getReg();
Register IncrReg = MI.getOperand(3).getReg();
Register MaskReg = MI.getOperand(4).getReg();
- AtomicOrdering Ordering =
- static_cast<AtomicOrdering>(MI.getOperand(5).getImm());
// .loop:
- // if(Ordering != AtomicOrdering::Monotonic)
- // dbar 0
// ll.w destreg, (alignedaddr)
// binop scratch, destreg, incr
// xor scratch, destreg, scratch
@@ -264,8 +254,6 @@ static void doMaskedAtomicBinOpExpansion(
// xor scratch, destreg, scratch
// sc.w scratch, scratch, (alignedaddr)
// beqz scratch, loop
- if (Ordering != AtomicOrdering::Monotonic)
- BuildMI(LoopMBB, DL, TII->get(LoongArch::DBAR)).addImm(0);
BuildMI(LoopMBB, DL, TII->get(LoongArch::LL_W), DestReg)
.addReg(AddrReg)
.addImm(0);
@@ -372,23 +360,20 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp(
auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto LoopIfBodyMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
- auto TailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
// Insert new MBBs.
MF->insert(++MBB.getIterator(), LoopHeadMBB);
MF->insert(++LoopHeadMBB->getIterator(), LoopIfBodyMBB);
MF->insert(++LoopIfBodyMBB->getIterator(), LoopTailMBB);
- MF->insert(++LoopTailMBB->getIterator(), TailMBB);
- MF->insert(++TailMBB->getIterator(), DoneMBB);
+ MF->insert(++LoopTailMBB->getIterator(), DoneMBB);
// Set up successors and transfer remaining instructions to DoneMBB.
LoopHeadMBB->addSuccessor(LoopIfBodyMBB);
LoopHeadMBB->addSuccessor(LoopTailMBB);
LoopIfBodyMBB->addSuccessor(LoopTailMBB);
LoopTailMBB->addSuccessor(LoopHeadMBB);
- LoopTailMBB->addSuccessor(TailMBB);
- TailMBB->addSuccessor(DoneMBB);
+ LoopTailMBB->addSuccessor(DoneMBB);
DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end());
DoneMBB->transferSuccessors(&MBB);
MBB.addSuccessor(LoopHeadMBB);
@@ -402,11 +387,9 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp(
//
// .loophead:
- // dbar 0
// ll.w destreg, (alignedaddr)
// and scratch2, destreg, mask
// move scratch1, destreg
- BuildMI(LoopHeadMBB, DL, TII->get(LoongArch::DBAR)).addImm(0);
BuildMI(LoopHeadMBB, DL, TII->get(LoongArch::LL_W), DestReg)
.addReg(AddrReg)
.addImm(0);
@@ -463,7 +446,6 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp(
// .looptail:
// sc.w scratch1, scratch1, (addr)
// beqz scratch1, loop
- // dbar 0x700
BuildMI(LoopTailMBB, DL, TII->get(LoongArch::SC_W), Scratch1Reg)
.addReg(Scratch1Reg)
.addReg(AddrReg)
@@ -472,10 +454,6 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp(
.addReg(Scratch1Reg)
.addMBB(LoopHeadMBB);
- // .tail:
- // dbar 0x700
- BuildMI(TailMBB, DL, TII->get(LoongArch::DBAR)).addImm(0x700);
-
NextMBBI = MBB.end();
MI.eraseFromParent();
@@ -483,7 +461,6 @@ bool LoongArchExpandAtomicPseudo::expandAtomicMinMaxOp(
computeAndAddLiveIns(LiveRegs, *LoopHeadMBB);
computeAndAddLiveIns(LiveRegs, *LoopIfBodyMBB);
computeAndAddLiveIns(LiveRegs, *LoopTailMBB);
- computeAndAddLiveIns(LiveRegs, *TailMBB);
computeAndAddLiveIns(LiveRegs, *DoneMBB);
return true;
@@ -535,12 +512,10 @@ bool LoongArchExpandAtomicPseudo::expandAtomicCmpXchg(
.addReg(CmpValReg)
.addMBB(TailMBB);
// .looptail:
- // dbar 0
// move scratch, newval
// sc.[w|d] scratch, scratch, (addr)
// beqz scratch, loophead
// b done
- BuildMI(LoopTailMBB, DL, TII->get(LoongArch::DBAR)).addImm(0);
BuildMI(LoopTailMBB, DL, TII->get(LoongArch::OR), ScratchReg)
.addReg(NewValReg)
.addReg(LoongArch::R0);
@@ -573,13 +548,11 @@ bool LoongArchExpandAtomicPseudo::expandAtomicCmpXchg(
.addMBB(TailMBB);
// .looptail:
- // dbar 0
// andn scratch, dest, mask
// or scratch, scratch, newval
// sc.[w|d] scratch, scratch, (addr)
// beqz scratch, loophead
// b done
- BuildMI(LoopTailMBB, DL, TII->get(LoongArch::DBAR)).addImm(0);
BuildMI(LoopTailMBB, DL, TII->get(LoongArch::ANDN), ScratchReg)
.addReg(DestReg)
.addReg(MaskReg);
@@ -598,9 +571,24 @@ bool LoongArchExpandAtomicPseudo::expandAtomicCmpXchg(
BuildMI(LoopTailMBB, DL, TII->get(LoongArch::B)).addMBB(DoneMBB);
}
+ AtomicOrdering Ordering =
+ static_cast<AtomicOrdering>(MI.getOperand(IsMasked ? 6 : 5).getImm());
+ int hint;
+
+ switch (Ordering) {
+ case AtomicOrdering::Acquire:
+ case AtomicOrdering::AcquireRelease:
+ case AtomicOrdering::SequentiallyConsistent:
+ // acquire
+ hint = 0;
+ break;
+ default:
+ hint = 0x700;
+ }
+
// .tail:
- // dbar 0x700
- BuildMI(TailMBB, DL, TII->get(LoongArch::DBAR)).addImm(0x700);
+ // dbar 0x700 | 0
+ BuildMI(TailMBB, DL, TII->get(LoongArch::DBAR)).addImm(hint);
NextMBBI = MBB.end();
MI.eraseFromParent();
diff --git a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
index 5d4ed46025d0578..8d1b018995edaca 100644
--- a/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchInstrInfo.td
@@ -1792,7 +1792,7 @@ def PseudoMaskedAtomicLoadMin32 : PseudoMaskedAMMinMax;
class PseudoCmpXchg
: Pseudo<(outs GPR:$res, GPR:$scratch),
- (ins GPR:$addr, GPR:$cmpval, GPR:$newval)> {
+ (ins GPR:$addr, GPR:$cmpval, GPR:$newval, grlenimm:$ordering)> {
let Constraints = "@earlyclobber $res,@earlyclobber $scratch";
let mayLoad = 1;
let mayStore = 1;
@@ -1882,14 +1882,28 @@ def : AtomicPat<int_loongarch_masked_atomicrmw_umax_i64,
def : AtomicPat<int_loongarch_masked_atomicrmw_umin_i64,
PseudoMaskedAtomicLoadUMin32>;
-def : Pat<(atomic_cmp_swap_64 GPR:$addr, GPR:$cmp, GPR:$new),
- (PseudoCmpXchg64 GPR:$addr, GPR:$cmp, GPR:$new)>;
+// Ordering constants must be kept in sync with the AtomicOrdering enum in
+// AtomicOrdering.h.
+multiclass PseudoCmpXchgPat<string Op, Pseudo CmpXchgInst,
+ ValueType vt = GRLenVT> {
+ def : Pat<(vt (!cast<PatFrag>(Op#"_monotonic") GPR:$addr, GPR:$cmp, GPR:$new)),
+ (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 2)>;
+ def : Pat<(vt (!cast<PatFrag>(Op#"_acquire") GPR:$addr, GPR:$cmp, GPR:$new)),
+ (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 4)>;
+ def : Pat<(vt (!cast<PatFrag>(Op#"_release") GPR:$addr, GPR:$cmp, GPR:$new)),
+ (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 5)>;
+ def : Pat<(vt (!cast<PatFrag>(Op#"_acq_rel") GPR:$addr, GPR:$cmp, GPR:$new)),
+ (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 6)>;
+ def : Pat<(vt (!cast<PatFrag>(Op#"_seq_cst") GPR:$addr, GPR:$cmp, GPR:$new)),
+ (CmpXchgInst GPR:$addr, GPR:$cmp, GPR:$new, 7)>;
+}
+
+defm : PseudoCmpXchgPat<"atomic_cmp_swap_32", PseudoCmpXchg32>;
+defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64, i64>;
def : Pat<(int_loongarch_masked_cmpxchg_i64
GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering),
(PseudoMaskedCmpXchg32
GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering)>;
-def : Pat<(atomic_cmp_swap_32 GPR:$addr, GPR:$cmp, GPR:$new),
- (PseudoCmpXchg32 GPR:$addr, GPR:$cmp, GPR:$new)>;
def : PseudoMaskedAMMinMaxPat<int_loongarch_masked_atomicrmw_max_i64,
PseudoMaskedAtomicLoadMax32>;
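For reference, the immediate operands 2, 4, 5, 6 and 7 in the patterns above are assumed to mirror the llvm::AtomicOrdering enumerators, as the in-source comment says; a sketch of the relevant values (AtomicOrdering.h remains the authoritative definition):

```cpp
// Sketch of the llvm::AtomicOrdering values assumed by the patterns above;
// llvm/include/llvm/Support/AtomicOrdering.h is the authoritative source.
enum class AtomicOrdering : unsigned {
  NotAtomic = 0,
  Unordered = 1,
  Monotonic = 2,              // atomic_cmp_swap_*_monotonic -> imm 2
  // 3 is intentionally unused
  Acquire = 4,                // atomic_cmp_swap_*_acquire   -> imm 4
  Release = 5,                // atomic_cmp_swap_*_release   -> imm 5
  AcquireRelease = 6,         // atomic_cmp_swap_*_acq_rel   -> imm 6
  SequentiallyConsistent = 7  // atomic_cmp_swap_*_seq_cst   -> imm 7
};
```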
diff --git a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
index fba340bed422245..d8b0fc1e095b710 100644
--- a/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
+++ b/llvm/test/CodeGen/LoongArch/atomicrmw-uinc-udec-wrap.ll
@@ -33,14 +33,13 @@ define i8 @atomicrmw_uinc_wrap_i8(ptr %ptr, i8 %val) {
; LA64-NEXT: bne $a5, $a2, .LBB0_5
; LA64-NEXT: # %bb.4: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB0_3 Depth=2
-; LA64-NEXT: dbar 0
; LA64-NEXT: move $a7, $a6
; LA64-NEXT: sc.w $a7, $a0, 0
; LA64-NEXT: beqz $a7, .LBB0_3
; LA64-NEXT: b .LBB0_6
; LA64-NEXT: .LBB0_5: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB0_1 Depth=1
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB0_6: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB0_1 Depth=1
; LA64-NEXT: addi.w $a6, $a2, 0
@@ -86,14 +85,13 @@ define i16 @atomicrmw_uinc_wrap_i16(ptr %ptr, i16 %val) {
; LA64-NEXT: bne $a5, $a2, .LBB1_5
; LA64-NEXT: # %bb.4: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB1_3 Depth=2
-; LA64-NEXT: dbar 0
; LA64-NEXT: move $a7, $a6
; LA64-NEXT: sc.w $a7, $a0, 0
; LA64-NEXT: beqz $a7, .LBB1_3
; LA64-NEXT: b .LBB1_6
; LA64-NEXT: .LBB1_5: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB1_1 Depth=1
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB1_6: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB1_1 Depth=1
; LA64-NEXT: addi.w $a6, $a2, 0
@@ -127,14 +125,13 @@ define i32 @atomicrmw_uinc_wrap_i32(ptr %ptr, i32 %val) {
; LA64-NEXT: bne $a1, $a3, .LBB2_5
; LA64-NEXT: # %bb.4: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB2_3 Depth=2
-; LA64-NEXT: dbar 0
; LA64-NEXT: move $a6, $a5
; LA64-NEXT: sc.w $a6, $a0, 0
; LA64-NEXT: beqz $a6, .LBB2_3
; LA64-NEXT: b .LBB2_6
; LA64-NEXT: .LBB2_5: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB2_6: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB2_1 Depth=1
; LA64-NEXT: move $a3, $a1
@@ -166,14 +163,13 @@ define i64 @atomicrmw_uinc_wrap_i64(ptr %ptr, i64 %val) {
; LA64-NEXT: bne $a2, $a3, .LBB3_5
; LA64-NEXT: # %bb.4: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB3_3 Depth=2
-; LA64-NEXT: dbar 0
; LA64-NEXT: move $a5, $a4
; LA64-NEXT: sc.d $a5, $a0, 0
; LA64-NEXT: beqz $a5, .LBB3_3
; LA64-NEXT: b .LBB3_6
; LA64-NEXT: .LBB3_5: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB3_1 Depth=1
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB3_6: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB3_1 Depth=1
; LA64-NEXT: bne $a2, $a3, .LBB3_1
@@ -221,14 +217,13 @@ define i8 @atomicrmw_udec_wrap_i8(ptr %ptr, i8 %val) {
; LA64-NEXT: bne $a6, $a2, .LBB4_5
; LA64-NEXT: # %bb.4: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB4_3 Depth=2
-; LA64-NEXT: dbar 0
; LA64-NEXT: move $t0, $a7
; LA64-NEXT: sc.w $t0, $a0, 0
; LA64-NEXT: beqz $t0, .LBB4_3
; LA64-NEXT: b .LBB4_6
; LA64-NEXT: .LBB4_5: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB4_1 Depth=1
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB4_6: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB4_1 Depth=1
; LA64-NEXT: addi.w $a7, $a2, 0
@@ -279,14 +274,13 @@ define i16 @atomicrmw_udec_wrap_i16(ptr %ptr, i16 %val) {
; LA64-NEXT: bne $a6, $a2, .LBB5_5
; LA64-NEXT: # %bb.4: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB5_3 Depth=2
-; LA64-NEXT: dbar 0
; LA64-NEXT: move $t0, $a7
; LA64-NEXT: sc.w $t0, $a0, 0
; LA64-NEXT: beqz $t0, .LBB5_3
; LA64-NEXT: b .LBB5_6
; LA64-NEXT: .LBB5_5: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB5_1 Depth=1
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB5_6: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB5_1 Depth=1
; LA64-NEXT: addi.w $a7, $a2, 0
@@ -325,14 +319,13 @@ define i32 @atomicrmw_udec_wrap_i32(ptr %ptr, i32 %val) {
; LA64-NEXT: bne $a2, $a4, .LBB6_5
; LA64-NEXT: # %bb.4: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB6_3 Depth=2
-; LA64-NEXT: dbar 0
; LA64-NEXT: move $a7, $a6
; LA64-NEXT: sc.w $a7, $a0, 0
; LA64-NEXT: beqz $a7, .LBB6_3
; LA64-NEXT: b .LBB6_6
; LA64-NEXT: .LBB6_5: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB6_6: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB6_1 Depth=1
; LA64-NEXT: move $a4, $a2
@@ -369,14 +362,13 @@ define i64 @atomicrmw_udec_wrap_i64(ptr %ptr, i64 %val) {
; LA64-NEXT: bne $a2, $a3, .LBB7_5
; LA64-NEXT: # %bb.4: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB7_3 Depth=2
-; LA64-NEXT: dbar 0
; LA64-NEXT: move $a5, $a4
; LA64-NEXT: sc.d $a5, $a0, 0
; LA64-NEXT: beqz $a5, .LBB7_3
; LA64-NEXT: b .LBB7_6
; LA64-NEXT: .LBB7_5: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB7_1 Depth=1
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB7_6: # %atomicrmw.start
; LA64-NEXT: # in Loop: Header=BB7_1 Depth=1
; LA64-NEXT: bne $a2, $a3, .LBB7_1
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll
index 4e458e989c27e50..817bafcf0e62285 100644
--- a/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/atomic-cmpxchg.ll
@@ -20,14 +20,13 @@ define void @cmpxchg_i8_acquire_acquire(ptr %ptr, i8 %cmp, i8 %val) nounwind {
; LA64-NEXT: and $a5, $a4, $a3
; LA64-NEXT: bne $a5, $a1, .LBB0_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1
-; LA64-NEXT: dbar 0
; LA64-NEXT: andn $a5, $a4, $a3
; LA64-NEXT: or $a5, $a5, $a2
; LA64-NEXT: sc.w $a5, $a0, 0
; LA64-NEXT: beqz $a5, .LBB0_1
; LA64-NEXT: b .LBB0_4
; LA64-NEXT: .LBB0_3:
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB0_4:
; LA64-NEXT: ret
%res = cmpxchg ptr %ptr, i8 %cmp, i8 %val acquire acquire
@@ -54,14 +53,13 @@ define void @cmpxchg_i16_acquire_acquire(ptr %ptr, i16 %cmp, i16 %val) nounwind
; LA64-NEXT: and $a5, $a4, $a3
; LA64-NEXT: bne $a5, $a1, .LBB1_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1
-; LA64-NEXT: dbar 0
; LA64-NEXT: andn $a5, $a4, $a3
; LA64-NEXT: or $a5, $a5, $a2
; LA64-NEXT: sc.w $a5, $a0, 0
; LA64-NEXT: beqz $a5, .LBB1_1
; LA64-NEXT: b .LBB1_4
; LA64-NEXT: .LBB1_3:
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB1_4:
; LA64-NEXT: ret
%res = cmpxchg ptr %ptr, i16 %cmp, i16 %val acquire acquire
@@ -75,13 +73,12 @@ define void @cmpxchg_i32_acquire_acquire(ptr %ptr, i32 %cmp, i32 %val) nounwind
; LA64-NEXT: ll.w $a3, $a0, 0
; LA64-NEXT: bne $a3, $a1, .LBB2_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB2_1 Depth=1
-; LA64-NEXT: dbar 0
; LA64-NEXT: move $a4, $a2
; LA64-NEXT: sc.w $a4, $a0, 0
; LA64-NEXT: beqz $a4, .LBB2_1
; LA64-NEXT: b .LBB2_4
; LA64-NEXT: .LBB2_3:
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB2_4:
; LA64-NEXT: ret
%res = cmpxchg ptr %ptr, i32 %cmp, i32 %val acquire acquire
@@ -95,13 +92,12 @@ define void @cmpxchg_i64_acquire_acquire(ptr %ptr, i64 %cmp, i64 %val) nounwind
; LA64-NEXT: ll.d $a3, $a0, 0
; LA64-NEXT: bne $a3, $a1, .LBB3_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1
-; LA64-NEXT: dbar 0
; LA64-NEXT: move $a4, $a2
; LA64-NEXT: sc.d $a4, $a0, 0
; LA64-NEXT: beqz $a4, .LBB3_1
; LA64-NEXT: b .LBB3_4
; LA64-NEXT: .LBB3_3:
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB3_4:
; LA64-NEXT: ret
%res = cmpxchg ptr %ptr, i64 %cmp, i64 %val acquire acquire
@@ -127,14 +123,13 @@ define i8 @cmpxchg_i8_acquire_acquire_reti8(ptr %ptr, i8 %cmp, i8 %val) nounwind
; LA64-NEXT: and $a6, $a5, $a4
; LA64-NEXT: bne $a6, $a1, .LBB4_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB4_1 Depth=1
-; LA64-NEXT: dbar 0
; LA64-NEXT: andn $a6, $a5, $a4
; LA64-NEXT: or $a6, $a6, $a2
; LA64-NEXT: sc.w $a6, $a0, 0
; LA64-NEXT: beqz $a6, .LBB4_1
; LA64-NEXT: b .LBB4_4
; LA64-NEXT: .LBB4_3:
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB4_4:
; LA64-NEXT: srl.w $a0, $a5, $a3
; LA64-NEXT: ret
@@ -163,14 +158,13 @@ define i16 @cmpxchg_i16_acquire_acquire_reti16(ptr %ptr, i16 %cmp, i16 %val) nou
; LA64-NEXT: and $a6, $a5, $a3
; LA64-NEXT: bne $a6, $a1, .LBB5_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB5_1 Depth=1
-; LA64-NEXT: dbar 0
; LA64-NEXT: andn $a6, $a5, $a3
; LA64-NEXT: or $a6, $a6, $a2
; LA64-NEXT: sc.w $a6, $a0, 0
; LA64-NEXT: beqz $a6, .LBB5_1
; LA64-NEXT: b .LBB5_4
; LA64-NEXT: .LBB5_3:
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB5_4:
; LA64-NEXT: srl.w $a0, $a5, $a4
; LA64-NEXT: ret
@@ -186,13 +180,12 @@ define i32 @cmpxchg_i32_acquire_acquire_reti32(ptr %ptr, i32 %cmp, i32 %val) nou
; LA64-NEXT: ll.w $a3, $a0, 0
; LA64-NEXT: bne $a3, $a1, .LBB6_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB6_1 Depth=1
-; LA64-NEXT: dbar 0
; LA64-NEXT: move $a4, $a2
; LA64-NEXT: sc.w $a4, $a0, 0
; LA64-NEXT: beqz $a4, .LBB6_1
; LA64-NEXT: b .LBB6_4
; LA64-NEXT: .LBB6_3:
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB6_4:
; LA64-NEXT: move $a0, $a3
; LA64-NEXT: ret
@@ -208,13 +201,12 @@ define i64 @cmpxchg_i64_acquire_acquire_reti64(ptr %ptr, i64 %cmp, i64 %val) nou
; LA64-NEXT: ll.d $a3, $a0, 0
; LA64-NEXT: bne $a3, $a1, .LBB7_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB7_1 Depth=1
-; LA64-NEXT: dbar 0
; LA64-NEXT: move $a4, $a2
; LA64-NEXT: sc.d $a4, $a0, 0
; LA64-NEXT: beqz $a4, .LBB7_1
; LA64-NEXT: b .LBB7_4
; LA64-NEXT: .LBB7_3:
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB7_4:
; LA64-NEXT: move $a0, $a3
; LA64-NEXT: ret
@@ -242,14 +234,13 @@ define i1 @cmpxchg_i8_acquire_acquire_reti1(ptr %ptr, i8 %cmp, i8 %val) nounwind
; LA64-NEXT: and $a6, $a5, $a3
; LA64-NEXT: bne $a6, $a1, .LBB8_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB8_1 Depth=1
-; LA64-NEXT: dbar 0
; LA64-NEXT: andn $a6, $a5, $a3
; LA64-NEXT: or $a6, $a6, $a2
; LA64-NEXT: sc.w $a6, $a0, 0
; LA64-NEXT: beqz $a6, .LBB8_1
; LA64-NEXT: b .LBB8_4
; LA64-NEXT: .LBB8_3:
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB8_4:
; LA64-NEXT: and $a0, $a5, $a4
; LA64-NEXT: addi.w $a0, $a0, 0
@@ -281,14 +272,13 @@ define i1 @cmpxchg_i16_acquire_acquire_reti1(ptr %ptr, i16 %cmp, i16 %val) nounw
; LA64-NEXT: and $a6, $a5, $a4
; LA64-NEXT: bne $a6, $a1, .LBB9_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB9_1 Depth=1
-; LA64-NEXT: dbar 0
; LA64-NEXT: andn $a6, $a5, $a4
; LA64-NEXT: or $a6, $a6, $a2
; LA64-NEXT: sc.w $a6, $a0, 0
; LA64-NEXT: beqz $a6, .LBB9_1
; LA64-NEXT: b .LBB9_4
; LA64-NEXT: .LBB9_3:
-; LA64-NEXT: dbar 1792
+; LA64-NEXT: dbar 0
; LA64-NEXT: .LBB9_4:
; LA64-NEXT: and $a0, $a5, $a3
; LA64-NEXT: addi.w $a0, $a0, 0
@@ -307,13 +297,12 @@ define i1 @cmpxchg_i32_acquire_acquire_reti1(ptr %ptr, i32 %cmp, i32 %val) nounw
; LA64-NEXT: ll.w $a3, $a0, 0
; LA64-NEXT: bne $a3, $a1, .LBB10_3
; LA64-NEXT: # %bb.2: # in Loop: Header=BB10_1 Dep...
[truncated]
✅ With the latest revision this PR passed the C/C++ code formatter.
A test case for cmpxchg acquire. Without this patch, the unexpected behavior can be reproduced on the 3A6000. It can also be reproduced with the GCC compiler. Source: https://gist.github.com/heiher/76a42a083df9359c01be0c573ab17734

/*
============================================================================
Name : cmpxchg-acquire-test.c
Author : WANG Rui <[email protected]>
Copyright : Copyright (c) 2023 Loongson
Description : A test case for cmpxchg acquire (3A6000)
============================================================================
*/
#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>
#include <pthread.h>
static unsigned int tags[32];
static unsigned int vals[32];
static void *
writer_entry (void *data)
{
atomic_uint *pt = (atomic_uint *)tags;
atomic_uint *pv = (atomic_uint *)vals;
unsigned int n = 1;
for (; n;) {
atomic_store_explicit (&pv[n & 31], n, memory_order_release);
atomic_store_explicit (&pt[n & 31], n, memory_order_release);
n++;
}
return NULL;
}
static void *
reader_entry (void *data)
{
atomic_uint *pt = (atomic_uint *)tags;
atomic_uint *pv = (atomic_uint *)vals;
int i;
for (;;) {
for (i = 0; i < 32; i++) {
unsigned int tag = 0;
bool res;
res = atomic_compare_exchange_weak_explicit (
&pt[i], &tag, 0, memory_order_acquire, memory_order_acquire);
if (!res) {
unsigned int val;
val = atomic_load_explicit (&pv[i], memory_order_relaxed);
if (val < tag)
printf ("UNEXPECTED: val(%u) < tag(%u)\n", val, tag);
}
}
}
return NULL;
}
int
main (int argc, char *argv[])
{
pthread_t writer;
pthread_t reader;
int res;
res = pthread_create (&writer, NULL, writer_entry, NULL);
if (res < 0)
return -1;
res = pthread_create (&reader, NULL, reader_entry, NULL);
if (res < 0)
return -1;
res = pthread_join (writer, NULL);
if (res < 0)
return -1;
printf ("PASSED\n");
return 0;
}
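For context on the reproducer above: the writer publishes each value with a release store to vals[n & 31] before the matching release store to tags[n & 31]; the reader then performs a weak compare-exchange on the tag with acquire failure ordering, followed by a relaxed load of the value. If the failure path of the compare-exchange does not actually provide acquire semantics, the relaxed load can observe a value older than the tag it was published with, which is exactly the `UNEXPECTED: val < tag` condition the test reports.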
LGTM
It's better to create two separate PRs, one for each commit. Refer: https://llvm.org/docs/GitHub.html#creating-pull-requests
I agree with you.
PR llvm#67391 improved atomic codegen by handling memory ordering specified by the `cmpxchg` instruction. An acquire barrier needs to be generated when memory ordering includes an acquire operation. This PR improves the codegen further by only handling the failure ordering.
PR #67391 improved atomic codegen by handling memory ordering specified by the `cmpxchg` instruction. An acquire barrier needs to be generated when memory ordering includes an acquire operation. This PR improves the codegen further by only handling the failure ordering.
This is isomorphic to the LLVM changes [1-2]. On LoongArch, the LL and SC instructions have memory barrier semantics: - LL: &lt;memory-barrier&gt; + &lt;load-exclusive&gt; - SC: &lt;store-conditional&gt; + &lt;memory-barrier&gt; But the compare-and-swap operation is allowed to fail, and if it fails the SC instruction is not executed, thus the guarantee of acquire semantics cannot be ensured. Therefore, an acquire barrier needs to be generated when failure_memorder includes an acquire operation. On CPUs implementing LoongArch v1.10 or later, "dbar 0b10100" is an acquire barrier; on CPUs implementing LoongArch v1.00, it is a full barrier, so it is always enough for acquire semantics. OTOH, if acquire semantics are not needed, we still need the "dbar 0x700" as the load-load barrier like all LL-SC loops. [1]: llvm/llvm-project#67391 [2]: llvm/llvm-project#69339 gcc/ChangeLog: * config/loongarch/loongarch.cc (loongarch_memmodel_needs_release_fence): Remove. (loongarch_cas_failure_memorder_needs_acquire): New static function. (loongarch_print_operand): Redefine 'G' for the barrier on CAS failure. * config/loongarch/sync.md (atomic_cas_value_strong<mode>): Remove the redundant barrier before the LL instruction, and emit an acquire barrier on failure if needed by failure_memorder. (atomic_cas_value_cmp_and_7_<mode>): Likewise. (atomic_cas_value_add_7_<mode>): Remove the unnecessary barrier before the LL instruction. (atomic_cas_value_sub_7_<mode>): Likewise. (atomic_cas_value_and_7_<mode>): Likewise. (atomic_cas_value_xor_7_<mode>): Likewise. (atomic_cas_value_or_7_<mode>): Likewise. (atomic_cas_value_nand_7_<mode>): Likewise. (atomic_cas_value_exchange_7_<mode>): Likewise. gcc/testsuite/ChangeLog: * gcc.target/loongarch/cas-acquire.c: New test.
This is isomorphic to the LLVM changes [1-2]. On LoongArch, the LL and SC instructions have memory barrier semantics: - LL: &lt;memory-barrier&gt; + &lt;load-exclusive&gt; - SC: &lt;store-conditional&gt; + &lt;memory-barrier&gt; But the compare-and-swap operation is allowed to fail, and if it fails the SC instruction is not executed, thus the guarantee of acquire semantics cannot be ensured. Therefore, an acquire barrier needs to be generated when failure_memorder includes an acquire operation. On CPUs implementing LoongArch v1.10 or later, "dbar 0b10100" is an acquire barrier; on CPUs implementing LoongArch v1.00, it is a full barrier, so it is always enough for acquire semantics. OTOH, if acquire semantics are not needed, we still need the "dbar 0x700" as the load-load barrier like all LL-SC loops. [1]: llvm/llvm-project#67391 [2]: llvm/llvm-project#69339 Backported for fixing the acquire semantic issue which is known to cause troubles on LA664. gcc/ChangeLog: * config/loongarch/loongarch.cc (loongarch_memmodel_needs_release_fence): Remove. (loongarch_cas_failure_memorder_needs_acquire): New static function. (loongarch_print_operand): Redefine 'G' for the barrier on CAS failure. * config/loongarch/sync.md (atomic_cas_value_strong<mode>): Remove the redundant barrier before the LL instruction, and emit an acquire barrier on failure if needed by failure_memorder. (atomic_cas_value_cmp_and_7_<mode>): Likewise. (atomic_cas_value_add_7_<mode>): Remove the unnecessary barrier before the LL instruction. (atomic_cas_value_sub_7_<mode>): Likewise. (atomic_cas_value_and_7_<mode>): Likewise. (atomic_cas_value_xor_7_<mode>): Likewise. (atomic_cas_value_or_7_<mode>): Likewise. (atomic_cas_value_nand_7_<mode>): Likewise. (atomic_cas_value_exchange_7_<mode>): Likewise. gcc/testsuite/ChangeLog: * gcc.target/loongarch/cas-acquire.c: New test. (cherry picked from commit 4d86dc5)
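To summarize the selection logic described in the GCC change above, here is a hypothetical C++ helper (an illustration only, not the actual GCC sync.md or LLVM implementation) that picks the barrier emitted on the CAS failure path purely from the failure ordering:

```cpp
// Hypothetical sketch of the barrier selection described above; not the
// actual GCC (sync.md) or LLVM code.
enum class FailureOrder { Relaxed, Consume, Acquire, SeqCst };

unsigned casFailureBarrierHint(FailureOrder Order) {
  // Only the failure ordering matters here: on the success path the SC
  // instruction itself provides the trailing barrier.
  switch (Order) {
  case FailureOrder::Acquire:
  case FailureOrder::SeqCst:
    return 0x14;  // "dbar 0b10100": acquire barrier on LoongArch v1.10+,
                  // a full barrier on v1.00 (per the commit message above)
  default:
    return 0x700; // ordinary LL/SC load-load hint; no acquire needed
  }
}
```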
This PR improves memory barriers generated by atomic operations. Memory barrier semantics of LL/SC:
```
LL: <memory-barrier> + <load-exclusive>
SC: <store-conditional> + <memory-barrier>
```
Changes:
* Remove unnecessary memory barriers before LL and between LL/SC.
* Fix acquire semantics. (If the SC instruction is not executed, then the guarantee of acquiring semantics cannot be ensured. Therefore, an acquire barrier needs to be generated when memory ordering includes an acquire operation.)

(cherry picked from commit 203ba23)
Change-Id: I4ef87f94e7e01ae9bd93e1e57338534131e93590
PR llvm#67391 improved atomic codegen by handling memory ordering specified by the `cmpxchg` instruction. An acquire barrier needs to be generated when memory ordering includes an acquire operation. This PR improves the codegen further by only handling the failure ordering. (cherry picked from commit 78abc45) Change-Id: I00391ad1aaf5c64ae95cc0f4f84a0b480a2fb5b3