Use monotonic ordering

arsenm · arsenm · commit 74aa1ba330ff · 2024-07-24T08:39:24.000+04:00
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -19290,7 +19290,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
       // The global/flat cases need to use agent scope to consistently produce
       // the native instruction instead of a cmpxchg expansion.
       SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
-      AO = AtomicOrdering::SequentiallyConsistent;
+      AO = AtomicOrdering::Monotonic;
 
       // The v2bf16 builtin uses i16 instead of a natural bfloat type.
       if (BuiltinID == AMDGPU::BI__builtin_amdgcn_ds_atomic_fadd_v2bf16) {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx11.cl
@@ -49,7 +49,7 @@ void test_s_wait_event_export_ready() {
 }
 
 // CHECK-LABEL: @test_global_add_f32
-// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
 void test_global_add_f32(float *rtn, global float *addr, float x) {
   *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx12.cl
@@ -11,7 +11,7 @@ typedef short __attribute__((ext_vector_type(2))) short2;
 
 // CHECK-LABEL: test_local_add_2bf16
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
 
 // GFX12-LABEL:  test_local_add_2bf16
@@ -22,7 +22,7 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
 
 // CHECK-LABEL: test_local_add_2bf16_noret
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
 
 // GFX12-LABEL:  test_local_add_2bf16_noret
@@ -32,15 +32,15 @@ void test_local_add_2bf16_noret(__local short2 *addr, short2 x) {
 }
 
 // CHECK-LABEL: test_local_add_2f16
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
 // GFX12-LABEL:  test_local_add_2f16
 // GFX12: ds_pk_add_rtn_f16
 half2 test_local_add_2f16(__local half2 *addr, half2 x) {
   return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
 }
 
 // CHECK-LABEL: test_local_add_2f16_noret
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
 // GFX12-LABEL:  test_local_add_2f16_noret
 // GFX12: ds_pk_add_f16
 void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx8.cl
@@ -6,7 +6,7 @@
 // REQUIRES: amdgpu-registered-target
 
 // CHECK-LABEL: test_fadd_local
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} monotonic, align 4
 // GFX8-LABEL: test_fadd_local$local:
 // GFX8: ds_add_rtn_f32 v2, v0, v1
 // GFX8: s_endpgm
@@ -16,7 +16,7 @@ kernel void test_fadd_local(__local float *ptr, float val){
 }
 
 // CHECK-LABEL: test_fadd_local_volatile
-// CHECK: = atomicrmw volatile fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw volatile fadd ptr addrspace(3) %{{.+}}, float %{{.+}} monotonic, align 4
 kernel void test_fadd_local_volatile(volatile __local float *ptr, float val){
     volatile float *res;
     *res = __builtin_amdgcn_ds_atomic_fadd_f32(ptr, val);
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx90a.cl
@@ -9,7 +9,7 @@
 typedef half __attribute__((ext_vector_type(2))) half2;
 
 // CHECK-LABEL: test_global_add_f64
-// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, double %{{.+}} syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, double %{{.+}} syncscope("agent") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}}
 // GFX90A-LABEL:  test_global_add_f64$local:
 // GFX90A:  global_atomic_add_f64
 void test_global_add_f64(__global double *addr, double x) {
@@ -99,7 +99,7 @@ void test_flat_global_max_f64(__global double *addr, double x){
 }
 
 // CHECK-LABEL: test_ds_add_local_f64
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} seq_cst, align 8
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, double %{{.+}} monotonic, align 8
 // GFX90A:  test_ds_add_local_f64$local
 // GFX90A:  ds_add_rtn_f64
 void test_ds_add_local_f64(__local double *addr, double x){
@@ -108,7 +108,7 @@ void test_ds_add_local_f64(__local double *addr, double x){
 }
 
 // CHECK-LABEL: test_ds_addf_local_f32
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, float %{{.+}} monotonic, align 4
 // GFX90A-LABEL:  test_ds_addf_local_f32$local
 // GFX90A:  ds_add_rtn_f32
 void test_ds_addf_local_f32(__local float *addr, float x){
@@ -117,7 +117,7 @@ void test_ds_addf_local_f32(__local float *addr, float x){
 }
 
 // CHECK-LABEL: @test_global_add_f32
-// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
 void test_global_add_f32(float *rtn, global float *addr, float x) {
   *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
 }
diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl
@@ -44,7 +44,7 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) {
 // CHECK-LABEL: test_local_add_2bf16
 
 // CHECK: [[BC0:%.+]] = bitcast <2 x i16> {{.+}} to <2 x bfloat>
-// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") seq_cst, align 4{{$}}
+// CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4{{$}}
 // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16>
 
 // GFX940-LABEL:  test_local_add_2bf16
@@ -54,23 +54,23 @@ short2 test_local_add_2bf16(__local short2 *addr, short2 x) {
 }
 
 // CHECK-LABEL: test_local_add_2f16
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
 // GFX940-LABEL:  test_local_add_2f16
 // GFX940: ds_pk_add_rtn_f16
 half2 test_local_add_2f16(__local half2 *addr, half2 x) {
   return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
 }
 
 // CHECK-LABEL: test_local_add_2f16_noret
-// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} seq_cst, align 4
+// CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4
 // GFX940-LABEL:  test_local_add_2f16_noret
 // GFX940: ds_pk_add_f16
 void test_local_add_2f16_noret(__local half2 *addr, half2 x) {
   __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x);
 }
 
 // CHECK-LABEL: @test_global_add_f32
-// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
+// CHECK: = atomicrmw fadd ptr addrspace(1) %{{.+}}, float %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}
 void test_global_add_f32(float *rtn, global float *addr, float x) {
   *rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);
 }

Original file line number	Diff line number	Diff line change
`@@ -49,7 +49,7 @@ void test_s_wait_event_export_ready() {`
`49`	`49`	`}`
`50`	`50`
`51`	`51`	`// CHECK-LABEL: @test_global_add_f32`
`52`		`-// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}`
	`52`	`+// CHECK: = atomicrmw fadd ptr addrspace(1) %addr, float %x syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}}`
`53`	`53`	`void test_global_add_f32(float rtn, global float addr, float x) {`
`54`	`54`	`*rtn = __builtin_amdgcn_global_atomic_fadd_f32(addr, x);`
`55`	`55`	`}`