
Commit 3cee196

[OpenMP] MI300 specific fast FP atomics for 6.1
Land fast FP Atomics in 6.1
===========
Revert "Revert "[OpenMP] MI300 specific fast FP atomics""

This reverts commit e47a617.

Also fix SWDEV-443900 to enable correct reverting.

Change-Id: Idb5e428c1eb59e372b5d6333b674ce4fa3ca0cb7
1 parent 2efecb6 commit 3cee196

File tree

2 files changed: 111 additions & 1 deletion

clang/lib/CodeGen/CGStmtOpenMP.cpp

Lines changed: 9 additions & 1 deletion
@@ -6311,6 +6311,9 @@ static bool canUseAMDGPUFastFPAtomics(CodeGenFunction &CGF, LValue X,
   if (!Update.isScalar())
     return false;
 
+  if (!X.isSimple())
+    return false;
+
   ASTContext &Context = CGF.getContext();
 
   // Handle fast FP atomics for AMDGPU target (call intrinsic)
@@ -6342,6 +6345,11 @@ static bool canUseAMDGPUFastFPAtomics(CodeGenFunction &CGF, LValue X,
     }
   }
 
+  // Fast FP atomics only work when the Update type is the same as the target X.
+  // If not, revert to atomicxchg and warn the user.
+  bool hasXandUpdateSameType =
+      (Update.getScalarVal()->getType() == X.getAddress(CGF).getElementType());
+
   bool addOpHasAMDGPUFastVersion =
       BO == BO_Add && (Update.getScalarVal()->getType()->isDoubleTy() ||
                        Update.getScalarVal()->getType()->isFloatTy());
@@ -6355,7 +6363,7 @@ static bool canUseAMDGPUFastFPAtomics(CodeGenFunction &CGF, LValue X,
          CGF.CGM.getLangOpts().OpenMPIsTargetDevice &&
          userRequestsAMDGPUFastFPAtomics &&
          (addOpHasAMDGPUFastVersion || minMaxOpHasAMDGPUFastVersion) &&
-         X.isSimple();
+         hasXandUpdateSameType && X.isSimple();
 }
 
 static std::pair<bool, RValue>
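With this change the predicate requires, on the target device, that the user requested AMD fast FP atomics, that the operation is an FP add or min/max on float or double, that X is a simple lvalue, and that the type of the computed update matches the element type of X. As a hypothetical illustration (not part of this commit) of the kind of update the new hasXandUpdateSameType guard is aimed at: whether the snippet below actually produces a double-typed update value depends on how Clang emits the compound assignment, so treat it as an assumption.

// Hypothetical example, not from the commit: `sum` is float, but the literal
// 1.0 is double. If the emitted update value is double, the new
// hasXandUpdateSameType check fails and codegen falls back to the generic
// atomic lowering instead of the AMDGPU fast FP intrinsic.
float sum = 0.0f;
#pragma omp target map(tofrom: sum)
{
  // AMD_unsafe_fp_atomics is the 1 << 20 hint macro defined in the test below.
  #pragma omp atomic hint(AMD_unsafe_fp_atomics)
  sum += 1.0;
}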
Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 4
// REQUIRES: amdgpu-registered-target

// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-target-device -target-cpu gfx942 -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s

#ifndef HEADER
#define HEADER

#define N 10000;

#define AMD_safe_fp_atomics 1 << 19
#define AMD_unsafe_fp_atomics 1 << 20

int main(){

  float sum = 0.0;

#pragma omp target map(tofrom: sum)
  {
#pragma omp atomic hint(AMD_safe_fp_atomics)
    sum+=(float)1.0;
  }

#pragma omp target map(tofrom: sum)
  {
#pragma omp atomic hint(AMD_unsafe_fp_atomics)
    sum+=(float)1.0;
  }

#pragma omp target map(tofrom: sum)
  {
#pragma omp atomic
    sum+=(float)1.0;
  }

  return 1;
}

#endif
// CHECK-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l19(
// CHECK-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l19_kernel_environment to ptr), ptr [[DYN_PTR]])
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[TMP0]], float 1.000000e+00) #[[ATTR2:[0-9]+]]
// CHECK-NEXT: call void @__kmpc_target_deinit()
// CHECK-NEXT: ret void
// CHECK: worker.exit:
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25(
// CHECK-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_kernel_environment to ptr), ptr [[DYN_PTR]])
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[TMP0]], float 1.000000e+00) #[[ATTR2]]
// CHECK-NEXT: call void @__kmpc_target_deinit()
// CHECK-NEXT: ret void
// CHECK: worker.exit:
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31(
// CHECK-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[SUM_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
// CHECK-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr
// CHECK-NEXT: [[SUM_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[SUM_ADDR]] to ptr
// CHECK-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8
// CHECK-NEXT: store ptr [[SUM]], ptr [[SUM_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SUM_ADDR_ASCAST]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l31_kernel_environment to ptr), ptr [[DYN_PTR]])
// CHECK-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1
// CHECK-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]]
// CHECK: user_code.entry:
// CHECK-NEXT: [[TMP2:%.*]] = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr [[TMP0]], float 1.000000e+00) #[[ATTR2]]
// CHECK-NEXT: call void @__kmpc_target_deinit()
// CHECK-NEXT: ret void
// CHECK: worker.exit:
// CHECK-NEXT: ret void
//
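The new test exercises only float adds; the predicate in CGStmtOpenMP.cpp also accepts double adds (addOpHasAMDGPUFastVersion checks isDoubleTy() as well). A hypothetical extra kernel along these lines, not part of the committed test, would cover that case; which intrinsic it should select (presumably an f64 flavor of the flat atomic fadd) is an assumption, and its CHECK lines would need to be regenerated with update_cc_test_checks.py.

// Hypothetical extension, not in this commit: the same atomic add on a
// double, which the fast-path predicate also accepts for BO_Add.
double dsum = 0.0;
#pragma omp target map(tofrom: dsum)
{
  #pragma omp atomic hint(AMD_unsafe_fp_atomics)
  dsum += 1.0;
}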
