Commit d6f9278

[X86] Use plain load/store instead of cmpxchg16b for atomics with AVX (#74275)
In late 2021, both Intel and AMD finally documented that every AVX-capable CPU has always been guaranteed to execute aligned 16-byte loads/stores atomically, and further, guaranteed that all future CPUs with AVX will do so as well. Therefore, we may use normal SSE 128-bit load/store instructions to implement atomics, if AVX is enabled.

Per AMD64 Architecture Programmer's Manual, 7.3.2 Access Atomicity:

> Processors that report [AVX] extend the atomicity for cacheable,
> naturally-aligned single loads or stores from a quadword to a double
> quadword.

Per Intel's SDM:

> Processors that enumerate support for Intel(R) AVX guarantee that the
> 16-byte memory operations performed by the following instructions will
> always be carried out atomically:
> - MOVAPD, MOVAPS, and MOVDQA.
> - VMOVAPD, VMOVAPS, and VMOVDQA when encoded with VEX.128.
> - VMOVAPD, VMOVAPS, VMOVDQA32, and VMOVDQA64 when encoded with
>   EVEX.128 and k0 (masking disabled).

This was also confirmed to be true for Zhaoxin CPUs with AVX, in
https://gcc.gnu.org/PR104688
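As a quick, hedged illustration of what this enables (a minimal sketch, not taken from this commit's tests; the function names are made up and the exact instructions chosen may differ), a 16-byte-aligned i128 atomic load or store compiled with something like llc -mtriple=x86_64 -mattr=+avx should now select a single [V]MOVAPS/[V]MOVDQA instead of expanding into a lock cmpxchg16b loop:

; Minimal sketch, assuming llc -mtriple=x86_64 -mattr=+avx. Both functions
; should lower to one 16-byte vector load/store; without AVX the old
; cmpxchg16b-based expansion still applies.
define i128 @load_i128_avx(ptr %p) {
  %v = load atomic i128, ptr %p unordered, align 16
  ret i128 %v
}

define void @store_i128_avx(ptr %p, i128 %v) {
  store atomic i128 %v, ptr %p unordered, align 16
  ret void
}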
1 parent 84abe0a commit d6f9278

7 files changed: +259 -221 lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 70 additions & 30 deletions
@@ -544,6 +544,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   if (!Subtarget.is64Bit())
     setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom);
 
+  if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+    // All CPUs supporting AVX will atomically load/store aligned 128-bit
+    // values, so we can emit [V]MOVAPS/[V]MOVDQA.
+    setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
+    setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Custom);
+  }
+
   if (Subtarget.canUseCMPXCHG16B())
     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
 
@@ -30415,32 +30422,40 @@ TargetLoweringBase::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
   Type *MemType = SI->getValueOperand()->getType();
 
-  bool NoImplicitFloatOps =
-      SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
-  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-      (Subtarget.hasSSE1() || Subtarget.hasX87()))
-    return AtomicExpansionKind::None;
+  if (!SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+      !Subtarget.useSoftFloat()) {
+    if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+        (Subtarget.hasSSE1() || Subtarget.hasX87()))
+      return AtomicExpansionKind::None;
+
+    if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+        Subtarget.hasAVX())
+      return AtomicExpansionKind::None;
+  }
 
   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand
                                  : AtomicExpansionKind::None;
 }
 
 // Note: this turns large loads into lock cmpxchg8b/16b.
-// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
 TargetLowering::AtomicExpansionKind
 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
   Type *MemType = LI->getType();
 
-  // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
-  // can use movq to do the load. If we have X87 we can load into an 80-bit
-  // X87 register and store it to a stack temporary.
-  bool NoImplicitFloatOps =
-      LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
-  if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
-      !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
-      (Subtarget.hasSSE1() || Subtarget.hasX87()))
-    return AtomicExpansionKind::None;
+  if (!LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat) &&
+      !Subtarget.useSoftFloat()) {
+    // If this a 64 bit atomic load on a 32-bit target and SSE2 is enabled, we
+    // can use movq to do the load. If we have X87 we can load into an 80-bit
+    // X87 register and store it to a stack temporary.
+    if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
+        (Subtarget.hasSSE1() || Subtarget.hasX87()))
+      return AtomicExpansionKind::None;
+
+    // If this is a 128-bit load with AVX, 128-bit SSE loads/stores are atomic.
+    if (MemType->getPrimitiveSizeInBits() == 128 && Subtarget.is64Bit() &&
+        Subtarget.hasAVX())
+      return AtomicExpansionKind::None;
+  }
 
   return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
                                  : AtomicExpansionKind::None;
@@ -31683,14 +31698,21 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
   if (!IsSeqCst && IsTypeLegal)
     return Op;
 
-  if (VT == MVT::i64 && !IsTypeLegal) {
+  if (!IsTypeLegal && !Subtarget.useSoftFloat() &&
+      !DAG.getMachineFunction().getFunction().hasFnAttribute(
+          Attribute::NoImplicitFloat)) {
+    SDValue Chain;
+    // For illegal i128 atomic_store, when AVX is enabled, we can simply emit a
+    // vector store.
+    if (VT == MVT::i128 && Subtarget.is64Bit() && Subtarget.hasAVX()) {
+      SDValue VecVal = DAG.getBitcast(MVT::v2i64, Node->getVal());
+      Chain = DAG.getStore(Node->getChain(), dl, VecVal, Node->getBasePtr(),
+                           Node->getMemOperand());
+    }
+
     // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
     // is enabled.
-    bool NoImplicitFloatOps =
-        DAG.getMachineFunction().getFunction().hasFnAttribute(
-            Attribute::NoImplicitFloat);
-    if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
-      SDValue Chain;
+    if (VT == MVT::i64) {
       if (Subtarget.hasSSE1()) {
         SDValue SclToVec =
             DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Node->getVal());
@@ -31722,15 +31744,15 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
           DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
                                   StoreOps, MVT::i64, Node->getMemOperand());
       }
+    }
 
-      if (Chain) {
-        // If this is a sequentially consistent store, also emit an appropriate
-        // barrier.
-        if (IsSeqCst)
-          Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+    if (Chain) {
+      // If this is a sequentially consistent store, also emit an appropriate
+      // barrier.
+      if (IsSeqCst)
+        Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
 
-        return Chain;
-      }
+      return Chain;
     }
   }
 
@@ -33303,12 +33325,30 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     return;
   }
   case ISD::ATOMIC_LOAD: {
-    assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
+    assert(
+        (N->getValueType(0) == MVT::i64 || N->getValueType(0) == MVT::i128) &&
+        "Unexpected VT!");
     bool NoImplicitFloatOps =
         DAG.getMachineFunction().getFunction().hasFnAttribute(
             Attribute::NoImplicitFloat);
     if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
       auto *Node = cast<AtomicSDNode>(N);
+
+      if (N->getValueType(0) == MVT::i128) {
+        if (Subtarget.is64Bit() && Subtarget.hasAVX()) {
+          SDValue Ld = DAG.getLoad(MVT::v2i64, dl, Node->getChain(),
+                                   Node->getBasePtr(), Node->getMemOperand());
+          SDValue ResL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+                                     DAG.getIntPtrConstant(0, dl));
+          SDValue ResH = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+                                     DAG.getIntPtrConstant(1, dl));
+          Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, N->getValueType(0),
+                                        {ResL, ResH}));
+          Results.push_back(Ld.getValue(1));
+          return;
+        }
+        break;
+      }
       if (Subtarget.hasSSE1()) {
         // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
         // Then extract the lower 64-bits.
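An illustrative note on orderings (a sketch based on the LowerATOMIC_STORE change above, not part of this commit): with AVX on x86-64, a sequentially consistent 16-byte atomic store takes the new i128 branch, so the value is bitcast to v2i64 and emitted as one vector store, and because IsSeqCst is true, emitLockedStackOp then appends a locked stack operation as the trailing barrier.

; Hedged sketch: store_i128_seq_cst is a hypothetical test function, assuming
; llc -mtriple=x86_64 -mattr=+avx. The store itself becomes a single 16-byte
; vector store; the seq_cst ordering additionally emits the locked stack
; operation produced by emitLockedStackOp as a full barrier.
define void @store_i128_seq_cst(ptr %p, i128 %v) {
  store atomic i128 %v, ptr %p seq_cst, align 16
  ret void
}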

llvm/test/CodeGen/X86/atomic-non-integer-fp128.ll

Lines changed: 3 additions & 28 deletions
@@ -28,22 +28,7 @@ define void @store_fp128(ptr %fptr, fp128 %v) {
 ;
 ; X64-AVX-LABEL: store_fp128:
 ; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: pushq %rbx
-; X64-AVX-NEXT: .cfi_def_cfa_offset 16
-; X64-AVX-NEXT: .cfi_offset %rbx, -16
-; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rbx
-; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
-; X64-AVX-NEXT: movq (%rdi), %rax
-; X64-AVX-NEXT: movq 8(%rdi), %rdx
-; X64-AVX-NEXT: .p2align 4, 0x90
-; X64-AVX-NEXT: .LBB0_1: # %atomicrmw.start
-; X64-AVX-NEXT: # =>This Inner Loop Header: Depth=1
-; X64-AVX-NEXT: lock cmpxchg16b (%rdi)
-; X64-AVX-NEXT: jne .LBB0_1
-; X64-AVX-NEXT: # %bb.2: # %atomicrmw.end
-; X64-AVX-NEXT: popq %rbx
-; X64-AVX-NEXT: .cfi_def_cfa_offset 8
+; X64-AVX-NEXT: vmovaps %xmm0, (%rdi)
 ; X64-AVX-NEXT: retq
   store atomic fp128 %v, ptr %fptr unordered, align 16
   ret void
@@ -69,19 +54,9 @@ define fp128 @load_fp128(ptr %fptr) {
 ;
 ; X64-AVX-LABEL: load_fp128:
 ; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: pushq %rbx
-; X64-AVX-NEXT: .cfi_def_cfa_offset 16
-; X64-AVX-NEXT: .cfi_offset %rbx, -16
-; X64-AVX-NEXT: xorl %eax, %eax
-; X64-AVX-NEXT: xorl %edx, %edx
-; X64-AVX-NEXT: xorl %ecx, %ecx
-; X64-AVX-NEXT: xorl %ebx, %ebx
-; X64-AVX-NEXT: lock cmpxchg16b (%rdi)
-; X64-AVX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
-; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps (%rdi), %xmm0
+; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
-; X64-AVX-NEXT: popq %rbx
-; X64-AVX-NEXT: .cfi_def_cfa_offset 8
 ; X64-AVX-NEXT: retq
   %v = load atomic fp128, ptr %fptr unordered, align 16
   ret fp128 %v

llvm/test/CodeGen/X86/atomic-non-integer.ll

Lines changed: 0 additions & 1 deletion
@@ -131,7 +131,6 @@ define void @store_double(ptr %fptr, double %v) {
   ret void
 }
 
-
 define half @load_half(ptr %fptr) {
 ; X86-SSE1-LABEL: load_half:
 ; X86-SSE1: # %bb.0:
Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
+; RUN: llc -mtriple=x86_64 < %s | FileCheck %s
+
+; Quick test to ensure that atomics which are not naturally-aligned
+; emit unsized libcalls, and aren't emitted as native instructions or
+; sized libcalls.
+define void @test_i32(ptr %a) nounwind {
+; CHECK-LABEL: test_i32:
+; CHECK: callq __atomic_load
+; CHECK: callq __atomic_store
+; CHECK: callq __atomic_exchange
+; CHECK: callq __atomic_compare_exchange
+; CHECK: callq __atomic_compare_exchange
+  %t0 = load atomic i32, ptr %a seq_cst, align 2
+  store atomic i32 1, ptr %a seq_cst, align 2
+  %t1 = atomicrmw xchg ptr %a, i32 1 seq_cst, align 2
+  %t3 = atomicrmw add ptr %a, i32 2 seq_cst, align 2
+  %t2 = cmpxchg ptr %a, i32 0, i32 1 seq_cst seq_cst, align 2
+  ret void
+}
+
+define void @test_i128(ptr %a) nounwind {
+; CHECK-LABEL: test_i128:
+; CHECK: callq __atomic_load
+; CHECK: callq __atomic_store
+; CHECK: callq __atomic_exchange
+; CHECK: callq __atomic_compare_exchange
+  %t0 = load atomic i128, ptr %a seq_cst, align 8
+  store atomic i128 1, ptr %a seq_cst, align 8
+  %t1 = atomicrmw xchg ptr %a, i128 1 seq_cst, align 8
+  %t2 = atomicrmw add ptr %a, i128 2 seq_cst, align 8
+  %t3 = cmpxchg ptr %a, i128 0, i128 1 seq_cst seq_cst, align 8
+  ret void
+}

llvm/test/CodeGen/X86/atomic-unordered.ll

Lines changed: 14 additions & 69 deletions
@@ -228,86 +228,31 @@ define void @widen_broadcast_unaligned(ptr %p0, i32 %v) {
 }
 
 define i128 @load_i128(ptr %ptr) {
-; CHECK-O0-LABEL: load_i128:
-; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: pushq %rbx
-; CHECK-O0-NEXT: .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT: .cfi_offset %rbx, -16
-; CHECK-O0-NEXT: xorl %eax, %eax
-; CHECK-O0-NEXT: movl %eax, %ebx
-; CHECK-O0-NEXT: movq %rbx, %rax
-; CHECK-O0-NEXT: movq %rbx, %rdx
-; CHECK-O0-NEXT: movq %rbx, %rcx
-; CHECK-O0-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-O0-NEXT: popq %rbx
-; CHECK-O0-NEXT: .cfi_def_cfa_offset 8
-; CHECK-O0-NEXT: retq
-;
-; CHECK-O3-LABEL: load_i128:
-; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: pushq %rbx
-; CHECK-O3-NEXT: .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT: .cfi_offset %rbx, -16
-; CHECK-O3-NEXT: xorl %eax, %eax
-; CHECK-O3-NEXT: xorl %edx, %edx
-; CHECK-O3-NEXT: xorl %ecx, %ecx
-; CHECK-O3-NEXT: xorl %ebx, %ebx
-; CHECK-O3-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-O3-NEXT: popq %rbx
-; CHECK-O3-NEXT: .cfi_def_cfa_offset 8
-; CHECK-O3-NEXT: retq
+; CHECK-LABEL: load_i128:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmovdqa (%rdi), %xmm0
+; CHECK-NEXT: vmovq %xmm0, %rax
+; CHECK-NEXT: vpextrq $1, %xmm0, %rdx
+; CHECK-NEXT: retq
   %v = load atomic i128, ptr %ptr unordered, align 16
   ret i128 %v
 }
 
 define void @store_i128(ptr %ptr, i128 %v) {
 ; CHECK-O0-LABEL: store_i128:
 ; CHECK-O0: # %bb.0:
-; CHECK-O0-NEXT: pushq %rbx
-; CHECK-O0-NEXT: .cfi_def_cfa_offset 16
-; CHECK-O0-NEXT: .cfi_offset %rbx, -16
-; CHECK-O0-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq (%rdi), %rax
-; CHECK-O0-NEXT: movq 8(%rdi), %rdx
-; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: jmp .LBB16_1
-; CHECK-O0-NEXT: .LBB16_1: # %atomicrmw.start
-; CHECK-O0-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; CHECK-O0-NEXT: lock cmpxchg16b (%rsi)
-; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-O0-NEXT: jne .LBB16_1
-; CHECK-O0-NEXT: jmp .LBB16_2
-; CHECK-O0-NEXT: .LBB16_2: # %atomicrmw.end
-; CHECK-O0-NEXT: popq %rbx
-; CHECK-O0-NEXT: .cfi_def_cfa_offset 8
+; CHECK-O0-NEXT: vmovq %rsi, %xmm0
+; CHECK-O0-NEXT: vmovq %rdx, %xmm1
+; CHECK-O0-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-O0-NEXT: vmovdqa %xmm0, (%rdi)
 ; CHECK-O0-NEXT: retq
 ;
 ; CHECK-O3-LABEL: store_i128:
 ; CHECK-O3: # %bb.0:
-; CHECK-O3-NEXT: pushq %rbx
-; CHECK-O3-NEXT: .cfi_def_cfa_offset 16
-; CHECK-O3-NEXT: .cfi_offset %rbx, -16
-; CHECK-O3-NEXT: movq %rdx, %rcx
-; CHECK-O3-NEXT: movq %rsi, %rbx
-; CHECK-O3-NEXT: movq (%rdi), %rax
-; CHECK-O3-NEXT: movq 8(%rdi), %rdx
-; CHECK-O3-NEXT: .p2align 4, 0x90
-; CHECK-O3-NEXT: .LBB16_1: # %atomicrmw.start
-; CHECK-O3-NEXT: # =>This Inner Loop Header: Depth=1
-; CHECK-O3-NEXT: lock cmpxchg16b (%rdi)
-; CHECK-O3-NEXT: jne .LBB16_1
-; CHECK-O3-NEXT: # %bb.2: # %atomicrmw.end
-; CHECK-O3-NEXT: popq %rbx
-; CHECK-O3-NEXT: .cfi_def_cfa_offset 8
+; CHECK-O3-NEXT: vmovq %rdx, %xmm0
+; CHECK-O3-NEXT: vmovq %rsi, %xmm1
+; CHECK-O3-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; CHECK-O3-NEXT: vmovdqa %xmm0, (%rdi)
 ; CHECK-O3-NEXT: retq
   store atomic i128 %v, ptr %ptr unordered, align 16
   ret void
