 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=-sse | FileCheck %s --check-prefix=X64-NOSSE
-; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs | FileCheck %s --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=sse2,cx16 | FileCheck %s --check-prefixes=X64-SSE
+; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx,cx16 | FileCheck %s --check-prefixes=X64-AVX
+; RUN: llc < %s -mtriple=x86_64-linux-generic -verify-machineinstrs -mattr=avx512f,cx16 | FileCheck %s --check-prefixes=X64-AVX

-; Note: This test is testing that the lowering for atomics matches what we
-; currently emit for non-atomics + the atomic restriction. The presence of
-; particular lowering detail in these tests should not be read as requiring
-; that detail for correctness unless it's related to the atomicity itself.
-; (Specifically, there were reviewer questions about the lowering for halfs
-; and their calling convention which remain unresolved.)
+; Codegen of fp128 without cx16 is tested in atomic-nocx16.ll

 define void @store_fp128(ptr %fptr, fp128 %v) {
-; X64-NOSSE-LABEL: store_fp128:
-; X64-NOSSE: # %bb.0:
-; X64-NOSSE-NEXT: pushq %rax
-; X64-NOSSE-NEXT: .cfi_def_cfa_offset 16
-; X64-NOSSE-NEXT: callq __sync_lock_test_and_set_16@PLT
-; X64-NOSSE-NEXT: popq %rax
-; X64-NOSSE-NEXT: .cfi_def_cfa_offset 8
-; X64-NOSSE-NEXT: retq
-;
 ; X64-SSE-LABEL: store_fp128:
 ; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: subq $24, %rsp
-; X64-SSE-NEXT: .cfi_def_cfa_offset 32
-; X64-SSE-NEXT: movaps %xmm0, (%rsp)
-; X64-SSE-NEXT: movq (%rsp), %rsi
-; X64-SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; X64-SSE-NEXT: callq __sync_lock_test_and_set_16@PLT
-; X64-SSE-NEXT: addq $24, %rsp
+; X64-SSE-NEXT: pushq %rbx
+; X64-SSE-NEXT: .cfi_def_cfa_offset 16
+; X64-SSE-NEXT: .cfi_offset %rbx, -16
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rbx
+; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-SSE-NEXT: movq (%rdi), %rax
+; X64-SSE-NEXT: movq 8(%rdi), %rdx
+; X64-SSE-NEXT: .p2align 4, 0x90
+; X64-SSE-NEXT: .LBB0_1: # %atomicrmw.start
+; X64-SSE-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-SSE-NEXT: lock cmpxchg16b (%rdi)
+; X64-SSE-NEXT: jne .LBB0_1
+; X64-SSE-NEXT: # %bb.2: # %atomicrmw.end
+; X64-SSE-NEXT: popq %rbx
 ; X64-SSE-NEXT: .cfi_def_cfa_offset 8
 ; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: store_fp128:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: pushq %rbx
+; X64-AVX-NEXT: .cfi_def_cfa_offset 16
+; X64-AVX-NEXT: .cfi_offset %rbx, -16
+; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rbx
+; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-AVX-NEXT: movq (%rdi), %rax
+; X64-AVX-NEXT: movq 8(%rdi), %rdx
+; X64-AVX-NEXT: .p2align 4, 0x90
+; X64-AVX-NEXT: .LBB0_1: # %atomicrmw.start
+; X64-AVX-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-AVX-NEXT: lock cmpxchg16b (%rdi)
+; X64-AVX-NEXT: jne .LBB0_1
+; X64-AVX-NEXT: # %bb.2: # %atomicrmw.end
+; X64-AVX-NEXT: popq %rbx
+; X64-AVX-NEXT: .cfi_def_cfa_offset 8
+; X64-AVX-NEXT: retq
   store atomic fp128 %v, ptr %fptr unordered, align 16
   ret void
 }
+
+define fp128 @load_fp128(ptr %fptr) {
+; X64-SSE-LABEL: load_fp128:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: pushq %rbx
+; X64-SSE-NEXT: .cfi_def_cfa_offset 16
+; X64-SSE-NEXT: .cfi_offset %rbx, -16
+; X64-SSE-NEXT: xorl %eax, %eax
+; X64-SSE-NEXT: xorl %edx, %edx
+; X64-SSE-NEXT: xorl %ecx, %ecx
+; X64-SSE-NEXT: xorl %ebx, %ebx
+; X64-SSE-NEXT: lock cmpxchg16b (%rdi)
+; X64-SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-SSE-NEXT: popq %rbx
+; X64-SSE-NEXT: .cfi_def_cfa_offset 8
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: load_fp128:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: pushq %rbx
+; X64-AVX-NEXT: .cfi_def_cfa_offset 16
+; X64-AVX-NEXT: .cfi_offset %rbx, -16
+; X64-AVX-NEXT: xorl %eax, %eax
+; X64-AVX-NEXT: xorl %edx, %edx
+; X64-AVX-NEXT: xorl %ecx, %ecx
+; X64-AVX-NEXT: xorl %ebx, %ebx
+; X64-AVX-NEXT: lock cmpxchg16b (%rdi)
+; X64-AVX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-AVX-NEXT: popq %rbx
+; X64-AVX-NEXT: .cfi_def_cfa_offset 8
+; X64-AVX-NEXT: retq
+  %v = load atomic fp128, ptr %fptr unordered, align 16
+  ret fp128 %v
+}
+
+define fp128 @exchange_fp128(ptr %fptr, fp128 %x) {
+; X64-SSE-LABEL: exchange_fp128:
+; X64-SSE: # %bb.0:
+; X64-SSE-NEXT: pushq %rbx
+; X64-SSE-NEXT: .cfi_def_cfa_offset 16
+; X64-SSE-NEXT: .cfi_offset %rbx, -16
+; X64-SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rbx
+; X64-SSE-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-SSE-NEXT: movq (%rdi), %rax
+; X64-SSE-NEXT: movq 8(%rdi), %rdx
+; X64-SSE-NEXT: .p2align 4, 0x90
+; X64-SSE-NEXT: .LBB2_1: # %atomicrmw.start
+; X64-SSE-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-SSE-NEXT: lock cmpxchg16b (%rdi)
+; X64-SSE-NEXT: jne .LBB2_1
+; X64-SSE-NEXT: # %bb.2: # %atomicrmw.end
+; X64-SSE-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; X64-SSE-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-SSE-NEXT: popq %rbx
+; X64-SSE-NEXT: .cfi_def_cfa_offset 8
+; X64-SSE-NEXT: retq
+;
+; X64-AVX-LABEL: exchange_fp128:
+; X64-AVX: # %bb.0:
+; X64-AVX-NEXT: pushq %rbx
+; X64-AVX-NEXT: .cfi_def_cfa_offset 16
+; X64-AVX-NEXT: .cfi_offset %rbx, -16
+; X64-AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rbx
+; X64-AVX-NEXT: movq -{{[0-9]+}}(%rsp), %rcx
+; X64-AVX-NEXT: movq (%rdi), %rax
+; X64-AVX-NEXT: movq 8(%rdi), %rdx
+; X64-AVX-NEXT: .p2align 4, 0x90
+; X64-AVX-NEXT: .LBB2_1: # %atomicrmw.start
+; X64-AVX-NEXT: # =>This Inner Loop Header: Depth=1
+; X64-AVX-NEXT: lock cmpxchg16b (%rdi)
+; X64-AVX-NEXT: jne .LBB2_1
+; X64-AVX-NEXT: # %bb.2: # %atomicrmw.end
+; X64-AVX-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; X64-AVX-NEXT: vmovaps -{{[0-9]+}}(%rsp), %xmm0
+; X64-AVX-NEXT: popq %rbx
+; X64-AVX-NEXT: .cfi_def_cfa_offset 8
+; X64-AVX-NEXT: retq
+  %v = atomicrmw xchg ptr %fptr, fp128 %x monotonic, align 16
+  ret fp128 %v
+}
+
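
For context only, and not part of the diff above: the C sketch below shows one way such 16-byte atomic operations might be written from source. The `_Float128` type, the `__atomic_*` builtins, and the `-mcx16` flag are assumptions about a plausible reproducer, not something taken from this change; whether a given compiler inlines these as `lock cmpxchg16b` loops (as the checks above expect from llc with cx16) or falls back to `__atomic_*` libcalls depends on the compiler version and enabled target features.

// Hypothetical reproducer, assuming clang/gcc __atomic builtins and _Float128
// support on x86-64; built with something like: clang -O2 -mcx16 -c repro.c
typedef _Float128 f128;  // 16-byte floating-point type (fp128 in LLVM IR)

void store_f128(f128 *p, f128 v) {
  // C "relaxed" maps to IR "monotonic"; the test uses "unordered", but with
  // cx16 the expected 16-byte store lowering is the same cmpxchg16b loop.
  __atomic_store(p, &v, __ATOMIC_RELAXED);
}

f128 load_f128(f128 *p) {
  f128 v;
  __atomic_load(p, &v, __ATOMIC_RELAXED);  // 16-byte atomic load
  return v;
}

f128 exchange_f128(f128 *p, f128 x) {
  f128 old;
  __atomic_exchange(p, &x, &old, __ATOMIC_RELAXED);  // like atomicrmw xchg monotonic
  return old;
}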