
Commit aea5980

Emit CAS loop for min/max atomics.
1 parent a64b2e9 commit aea5980

14 files changed: +17117 −1339 lines
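What the commit does: for integer and FP min/max atomicrmw operations at system scope (and for float min/max when unsafe FP atomics are disabled), the AMDGPU backend now tells the AtomicExpand pass to lower the operation into a compare-and-swap loop instead of emitting a native atomic. As a minimal sketch of that loop's shape, here is the same pattern in portable C++ with std::atomic (illustrative only, not part of the commit):

    #include <algorithm>
    #include <atomic>

    // CAS-loop form of an atomic max: load, compute the max, then try to
    // publish it; retry if another thread changed the location in between.
    int atomic_max(std::atomic<int> &loc, int operand) {
      int old = loc.load(std::memory_order_relaxed);
      while (!loc.compare_exchange_weak(old, std::max(old, operand),
                                        std::memory_order_relaxed))
        ; // on failure, 'old' is refreshed with the current value
      return old; // like atomicrmw, yields the previous value
    }

The global_atomic_cmpswap loops in the updated tests below are the hardware form of exactly this pattern.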

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 35 additions & 11 deletions
@@ -12867,12 +12867,23 @@ static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
   return DenormMode == DenormalMode::getIEEE();
 }
 
+// The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
+// floating point atomic instructions. May generate more efficient code,
+// but may not respect rounding and denormal modes, and may give incorrect
+// results for certain memory destinations.
+bool unsafeFPAtomicsDisabled(Function *F) {
+  return F->getFnAttribute("amdgpu-unsafe-fp-atomics").getValueAsString() !=
+         "true";
+}
+
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
   unsigned AS = RMW->getPointerAddressSpace();
   if (AS == AMDGPUAS::PRIVATE_ADDRESS)
     return AtomicExpansionKind::NotAtomic;
 
+  auto SSID = RMW->getSyncScopeID();
+
   auto ReportUnsafeHWInst = [&](TargetLowering::AtomicExpansionKind Kind) {
     OptimizationRemarkEmitter ORE(RMW->getFunction());
     LLVMContext &Ctx = RMW->getFunction()->getContext();
@@ -12891,6 +12902,10 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     return Kind;
   };
 
+  bool HasSystemScope =
+      SSID == SyncScope::System ||
+      SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
+
   switch (RMW->getOperation()) {
   case AtomicRMWInst::FAdd: {
     Type *Ty = RMW->getType();
@@ -12901,21 +12916,13 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     if (!Ty->isFloatTy() && (!Subtarget->hasGFX90AInsts() || !Ty->isDoubleTy()))
       return AtomicExpansionKind::CmpXChg;
 
-    if ((AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) &&
+    if (AMDGPU::isFlatGlobalAddrSpace(AS) &&
         Subtarget->hasAtomicFaddNoRtnInsts()) {
-      // The amdgpu-unsafe-fp-atomics attribute enables generation of unsafe
-      // floating point atomic instructions. May generate more efficient code,
-      // but may not respect rounding and denormal modes, and may give incorrect
-      // results for certain memory destinations.
-      if (RMW->getFunction()
-              ->getFnAttribute("amdgpu-unsafe-fp-atomics")
-              .getValueAsString() != "true")
+      if (unsafeFPAtomicsDisabled(RMW->getFunction()))
         return AtomicExpansionKind::CmpXChg;
 
       // Always expand system scope fp atomics.
-      auto SSID = RMW->getSyncScopeID();
-      if (SSID == SyncScope::System ||
-          SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
+      if (HasSystemScope)
         return AtomicExpansionKind::CmpXChg;
 
       if (AS == AMDGPUAS::GLOBAL_ADDRESS && Ty->isFloatTy()) {
@@ -12971,6 +12978,23 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
 
     return AtomicExpansionKind::CmpXChg;
   }
+  case AtomicRMWInst::FMin:
+  case AtomicRMWInst::FMax:
+  case AtomicRMWInst::Min:
+  case AtomicRMWInst::Max:
+  case AtomicRMWInst::UMin:
+  case AtomicRMWInst::UMax: {
+    if (AMDGPU::isFlatGlobalAddrSpace(AS)) {
+      if (RMW->getType()->isFloatTy() &&
+          unsafeFPAtomicsDisabled(RMW->getFunction()))
+        return AtomicExpansionKind::CmpXChg;
+
+      // Always expand system scope min/max atomics.
+      if (HasSystemScope)
+        return AtomicExpansionKind::CmpXChg;
+    }
+    break;
+  }
   default:
     break;
   }
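The new unsafeFPAtomicsDisabled helper just reads a string-valued function attribute; anything other than an explicit "true" keeps the conservative default. A standalone sketch of how the attribute is set and queried through the LLVM C++ API (enableUnsafeFPAtomics is a hypothetical helper for illustration, not part of this patch):

    #include "llvm/IR/Function.h"

    // Mirrors the query in the patch: unsafe FP atomics stay disabled
    // unless the function explicitly opts in with "true".
    static bool unsafeFPAtomicsDisabled(llvm::Function *F) {
      return F->getFnAttribute("amdgpu-unsafe-fp-atomics")
                 .getValueAsString() != "true";
    }

    // Hypothetical opt-in, e.g. applied by a frontend under a fast-math flag.
    static void enableUnsafeFPAtomics(llvm::Function &F) {
      F.addFnAttr("amdgpu-unsafe-fp-atomics", "true");
    }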

llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll

Lines changed: 152 additions & 12 deletions
@@ -151,8 +151,8 @@ define protected amdgpu_kernel void @nand(i32 addrspace(1)* %p, %S addrspace(1)*
   ret void
 }
 
-define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
-; CHECK-LABEL: max:
+define protected amdgpu_kernel void @max_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: max_workgroup:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
@@ -165,6 +165,41 @@ define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
+; CHECK-NEXT:    s_endpgm
+  %n32 = atomicrmw max i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
+  %n64 = zext i32 %n32 to i64
+  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
+  store float 1.0, float addrspace(1)* %p1
+  ret void
+}
+
+define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: max:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:  .LBB7_1: ; %atomicrmw.start
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_mov_b32_e32 v3, v0
+; CHECK-NEXT:    v_max_i32_e32 v2, 1, v3
+; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
+; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_cbranch_execnz .LBB7_1
+; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
+; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
+; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
   %n32 = atomicrmw max i32 addrspace(1)* %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
@@ -173,8 +208,8 @@ define protected amdgpu_kernel void @max(i32 addrspace(1)* %p, %S addrspace(1)*
   ret void
 }
 
-define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
-; CHECK-LABEL: min:
+define protected amdgpu_kernel void @min_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: min_workgroup:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
@@ -187,6 +222,41 @@ define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
+; CHECK-NEXT:    s_endpgm
+  %n32 = atomicrmw min i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
+  %n64 = zext i32 %n32 to i64
+  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
+  store float 1.0, float addrspace(1)* %p1
+  ret void
+}
+
+define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: min:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:  .LBB9_1: ; %atomicrmw.start
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_mov_b32_e32 v3, v0
+; CHECK-NEXT:    v_min_i32_e32 v2, 1, v3
+; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
+; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_cbranch_execnz .LBB9_1
+; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
+; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
+; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
   %n32 = atomicrmw min i32 addrspace(1)* %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
@@ -195,8 +265,8 @@ define protected amdgpu_kernel void @min(i32 addrspace(1)* %p, %S addrspace(1)*
   ret void
 }
 
-define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
-; CHECK-LABEL: umax:
+define protected amdgpu_kernel void @umax_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: umax_workgroup:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
@@ -209,6 +279,41 @@ define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
+; CHECK-NEXT:    s_endpgm
+  %n32 = atomicrmw umax i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
+  %n64 = zext i32 %n32 to i64
+  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
+  store float 1.0, float addrspace(1)* %p1
+  ret void
+}
+
+define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: umax:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:  .LBB11_1: ; %atomicrmw.start
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_mov_b32_e32 v3, v0
+; CHECK-NEXT:    v_max_u32_e32 v2, 1, v3
+; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
+; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_cbranch_execnz .LBB11_1
+; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
+; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
+; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
   %n32 = atomicrmw umax i32 addrspace(1)* %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
@@ -217,8 +322,8 @@ define protected amdgpu_kernel void @umax(i32 addrspace(1)* %p, %S addrspace(1)*
   ret void
 }
 
-define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
-; CHECK-LABEL: umin:
+define protected amdgpu_kernel void @umin_workgroup(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: umin_workgroup:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0
@@ -231,6 +336,41 @@ define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)*
 ; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v2, 12, v[0:1]
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
 ; CHECK-NEXT:    global_store_dword v[0:1], v2, off
+; CHECK-NEXT:    s_endpgm
+  %n32 = atomicrmw umin i32 addrspace(1)* %p, i32 1 syncscope("workgroup") monotonic
+  %n64 = zext i32 %n32 to i64
+  %p1 = getelementptr inbounds %S, %S addrspace(1)* %q, i64 %n64, i32 0
+  store float 1.0, float addrspace(1)* %p1
+  ret void
+}
+
+define protected amdgpu_kernel void @umin(i32 addrspace(1)* %p, %S addrspace(1)* %q) {
+; CHECK-LABEL: umin:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; CHECK-NEXT:    s_mov_b64 s[4:5], 0
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
+; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
+; CHECK-NEXT:    v_mov_b32_e32 v0, s6
+; CHECK-NEXT:  .LBB13_1: ; %atomicrmw.start
+; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    v_mov_b32_e32 v3, v0
+; CHECK-NEXT:    v_min_u32_e32 v2, 1, v3
+; CHECK-NEXT:    global_atomic_cmpswap v0, v1, v[2:3], s[0:1] glc
+; CHECK-NEXT:    s_waitcnt vmcnt(0)
+; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
+; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    s_cbranch_execnz .LBB13_1
+; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
+; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
+; CHECK-NEXT:    v_mov_b32_e32 v2, s2
+; CHECK-NEXT:    v_mov_b32_e32 v3, s3
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 12, v[2:3]
+; CHECK-NEXT:    v_mov_b32_e32 v2, 1.0
+; CHECK-NEXT:    global_store_dword v[0:1], v2, off
 ; CHECK-NEXT:    s_endpgm
   %n32 = atomicrmw umin i32 addrspace(1)* %p, i32 1 monotonic
   %n64 = zext i32 %n32 to i64
@@ -337,7 +477,7 @@ define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1
 ; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:  .LBB14_1: ; %atomicrmw.start
+; CHECK-NEXT:  .LBB18_1: ; %atomicrmw.start
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_add_f32_e32 v2, 1.0, v3
@@ -346,7 +486,7 @@ define protected amdgpu_kernel void @fadd(float addrspace(1)* %p, %S addrspace(1
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
 ; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execnz .LBB14_1
+; CHECK-NEXT:    s_cbranch_execnz .LBB18_1
 ; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v0
@@ -374,7 +514,7 @@ define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1
 ; CHECK-NEXT:    s_load_dword s6, s[0:1], 0x0
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, s6
-; CHECK-NEXT:  .LBB15_1: ; %atomicrmw.start
+; CHECK-NEXT:  .LBB19_1: ; %atomicrmw.start
 ; CHECK-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    v_mov_b32_e32 v3, v0
 ; CHECK-NEXT:    v_add_f32_e32 v2, -1.0, v3
@@ -383,7 +523,7 @@ define protected amdgpu_kernel void @fsub(float addrspace(1)* %p, %S addrspace(1
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
 ; CHECK-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; CHECK-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; CHECK-NEXT:    s_cbranch_execnz .LBB15_1
+; CHECK-NEXT:    s_cbranch_execnz .LBB19_1
 ; CHECK-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; CHECK-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v2, v0
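Every test update above follows the same pattern: the old function is renamed to a *_workgroup variant that adds syncscope("workgroup") and keeps its single native atomic, while a new function at the default (system) scope now compiles to the global_atomic_cmpswap loop (the remaining hunks only renumber the fadd/fsub loop labels shifted by the four new functions). A minimal sketch of how the two flavours are built with IRBuilder (illustrative, not taken from the commit; Ptr and Val are assumed to be an i32 global-address-space pointer and an i32 value):

    #include "llvm/IR/IRBuilder.h"

    using namespace llvm;

    void emitMaxAtBothScopes(IRBuilder<> &B, Value *Ptr, Value *Val) {
      // Default (system) scope: after this patch, expanded to a CAS loop.
      B.CreateAtomicRMW(AtomicRMWInst::Max, Ptr, Val, Align(4),
                        AtomicOrdering::Monotonic);
      // "workgroup" scope: narrower visibility, so the native atomic
      // instruction can still be used.
      SyncScope::ID WG = B.getContext().getOrInsertSyncScopeID("workgroup");
      B.CreateAtomicRMW(AtomicRMWInst::Max, Ptr, Val, Align(4),
                        AtomicOrdering::Monotonic, WG);
    }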
