Skip to content

Commit 88cebe1

Browse files
committed
review comments
1 parent 8b8b6c8 commit 88cebe1

11 files changed

+25740
-6519
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -184,9 +184,8 @@ static bool isOptimizableAtomic(Type *Ty) {
184184
case Type::DoubleTyID:
185185
return true;
186186
case Type::IntegerTyID: {
187-
unsigned size = Ty->getIntegerBitWidth();
188-
if (size == 32 || size == 64)
189-
return true;
187+
unsigned Size = Ty->getIntegerBitWidth();
188+
return (Size == 32 || Size == 64);
190189
}
191190
default:
192191
return false;
@@ -243,10 +242,14 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
243242

244243
// If the value operand is divergent, each lane is contributing a different
245244
// value to the atomic calculation. We can only optimize divergent values if
246-
// we have DPP available on our subtarget, and the atomic operation is 32
247-
// bits.
248-
if (ValDivergent && (!ST->hasDPP() || !isOptimizableAtomic(I.getType()))) {
249-
return;
245+
// we have DPP available on our subtarget (for DPP strategy), and the atomic
246+
// operation is 32 or 64 bits.
247+
if (ValDivergent) {
248+
if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
249+
return;
250+
251+
if (!isOptimizableAtomic(I.getType()))
252+
return;
250253
}
251254

252255
// If we get here, we can optimize the atomic using a single wavefront-wide
@@ -325,10 +328,14 @@ void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
325328

326329
// If the value operand is divergent, each lane is contributing a different
327330
// value to the atomic calculation. We can only optimize divergent values if
328-
// we have DPP available on our subtarget, and the atomic operation is 32
329-
// bits.
330-
if (ValDivergent && (!ST->hasDPP() || !isOptimizableAtomic(I.getType()))) {
331-
return;
331+
// we have DPP available on our subtarget (for DPP strategy), and the atomic
332+
// operation is 32 or 64 bits.
333+
if (ValDivergent) {
334+
if (ScanImpl == ScanOptions::DPP && !ST->hasDPP())
335+
return;
336+
337+
if (!isOptimizableAtomic(I.getType()))
338+
return;
332339
}
333340

334341
// If any of the other arguments to the intrinsic are divergent, we can't

llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll

Lines changed: 104 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -571,13 +571,44 @@ entry:
571571
define amdgpu_kernel void @add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
572572
; GFX6-LABEL: add_i32_varying_vdata:
573573
; GFX6: ; %bb.0: ; %entry
574-
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
575-
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
574+
; GFX6-NEXT: s_mov_b64 s[2:3], exec
575+
; GFX6-NEXT: s_mov_b32 s4, 0
576+
; GFX6-NEXT: ; implicit-def: $vgpr1
577+
; GFX6-NEXT: .LBB2_1: ; %ComputeLoop
578+
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
579+
; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
580+
; GFX6-NEXT: s_mov_b32 m0, s5
581+
; GFX6-NEXT: v_readlane_b32 s8, v0, s5
582+
; GFX6-NEXT: v_writelane_b32 v1, s4, m0
583+
; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
584+
; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
585+
; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
586+
; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
587+
; GFX6-NEXT: s_add_i32 s4, s4, s8
588+
; GFX6-NEXT: s_cbranch_vccnz .LBB2_1
589+
; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
590+
; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
591+
; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
592+
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
593+
; GFX6-NEXT: ; implicit-def: $vgpr0
594+
; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
595+
; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
596+
; GFX6-NEXT: s_cbranch_execz .LBB2_4
597+
; GFX6-NEXT: ; %bb.3:
598+
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
599+
; GFX6-NEXT: v_mov_b32_e32 v0, s4
576600
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
577-
; GFX6-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
601+
; GFX6-NEXT: buffer_atomic_add v0, off, s[8:11], 0 glc
602+
; GFX6-NEXT: .LBB2_4:
603+
; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
604+
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
578605
; GFX6-NEXT: s_mov_b32 s3, 0xf000
579606
; GFX6-NEXT: s_mov_b32 s2, -1
580607
; GFX6-NEXT: s_waitcnt vmcnt(0)
608+
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
609+
; GFX6-NEXT: s_waitcnt expcnt(0)
610+
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
611+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
581612
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
582613
; GFX6-NEXT: s_endpgm
583614
;
@@ -924,15 +955,46 @@ entry:
924955
define amdgpu_kernel void @struct_add_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout, i32 %vindex) {
925956
; GFX6-LABEL: struct_add_i32_varying_vdata:
926957
; GFX6: ; %bb.0: ; %entry
927-
; GFX6-NEXT: s_load_dword s2, s[0:1], 0x11
928-
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
929-
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
958+
; GFX6-NEXT: s_mov_b64 s[2:3], exec
959+
; GFX6-NEXT: s_mov_b32 s4, 0
960+
; GFX6-NEXT: ; implicit-def: $vgpr1
961+
; GFX6-NEXT: .LBB3_1: ; %ComputeLoop
962+
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
963+
; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
964+
; GFX6-NEXT: s_mov_b32 m0, s5
965+
; GFX6-NEXT: v_readlane_b32 s8, v0, s5
966+
; GFX6-NEXT: v_writelane_b32 v1, s4, m0
967+
; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
968+
; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
969+
; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
970+
; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
971+
; GFX6-NEXT: s_add_i32 s4, s4, s8
972+
; GFX6-NEXT: s_cbranch_vccnz .LBB3_1
973+
; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
974+
; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
975+
; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
976+
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
977+
; GFX6-NEXT: ; implicit-def: $vgpr0
978+
; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
979+
; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
980+
; GFX6-NEXT: s_cbranch_execz .LBB3_4
981+
; GFX6-NEXT: ; %bb.3:
982+
; GFX6-NEXT: s_load_dword s5, s[0:1], 0x11
983+
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
984+
; GFX6-NEXT: v_mov_b32_e32 v0, s4
930985
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
931-
; GFX6-NEXT: v_mov_b32_e32 v1, s2
932-
; GFX6-NEXT: buffer_atomic_add v0, v1, s[4:7], 0 idxen glc
986+
; GFX6-NEXT: v_mov_b32_e32 v2, s5
987+
; GFX6-NEXT: buffer_atomic_add v0, v2, s[8:11], 0 idxen glc
988+
; GFX6-NEXT: .LBB3_4:
989+
; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
990+
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
933991
; GFX6-NEXT: s_mov_b32 s3, 0xf000
934992
; GFX6-NEXT: s_mov_b32 s2, -1
935993
; GFX6-NEXT: s_waitcnt vmcnt(0)
994+
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
995+
; GFX6-NEXT: s_waitcnt expcnt(0)
996+
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v1
997+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
936998
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
937999
; GFX6-NEXT: s_endpgm
9381000
;
@@ -1953,13 +2015,44 @@ entry:
19532015
define amdgpu_kernel void @sub_i32_varying_vdata(ptr addrspace(1) %out, ptr addrspace(8) %inout) {
19542016
; GFX6-LABEL: sub_i32_varying_vdata:
19552017
; GFX6: ; %bb.0: ; %entry
1956-
; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xd
1957-
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
2018+
; GFX6-NEXT: s_mov_b64 s[2:3], exec
2019+
; GFX6-NEXT: s_mov_b32 s4, 0
2020+
; GFX6-NEXT: ; implicit-def: $vgpr1
2021+
; GFX6-NEXT: .LBB7_1: ; %ComputeLoop
2022+
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
2023+
; GFX6-NEXT: s_ff1_i32_b64 s5, s[2:3]
2024+
; GFX6-NEXT: s_mov_b32 m0, s5
2025+
; GFX6-NEXT: v_readlane_b32 s8, v0, s5
2026+
; GFX6-NEXT: v_writelane_b32 v1, s4, m0
2027+
; GFX6-NEXT: s_lshl_b64 s[6:7], 1, s5
2028+
; GFX6-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7]
2029+
; GFX6-NEXT: v_cmp_ne_u64_e64 s[6:7], s[2:3], 0
2030+
; GFX6-NEXT: s_and_b64 vcc, exec, s[6:7]
2031+
; GFX6-NEXT: s_add_i32 s4, s4, s8
2032+
; GFX6-NEXT: s_cbranch_vccnz .LBB7_1
2033+
; GFX6-NEXT: ; %bb.2: ; %ComputeEnd
2034+
; GFX6-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
2035+
; GFX6-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
2036+
; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
2037+
; GFX6-NEXT: ; implicit-def: $vgpr0
2038+
; GFX6-NEXT: s_and_saveexec_b64 s[2:3], vcc
2039+
; GFX6-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
2040+
; GFX6-NEXT: s_cbranch_execz .LBB7_4
2041+
; GFX6-NEXT: ; %bb.3:
2042+
; GFX6-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xd
2043+
; GFX6-NEXT: v_mov_b32_e32 v0, s4
19582044
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
1959-
; GFX6-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 glc
2045+
; GFX6-NEXT: buffer_atomic_sub v0, off, s[8:11], 0 glc
2046+
; GFX6-NEXT: .LBB7_4:
2047+
; GFX6-NEXT: s_or_b64 exec, exec, s[2:3]
2048+
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
19602049
; GFX6-NEXT: s_mov_b32 s3, 0xf000
19612050
; GFX6-NEXT: s_mov_b32 s2, -1
19622051
; GFX6-NEXT: s_waitcnt vmcnt(0)
2052+
; GFX6-NEXT: v_readfirstlane_b32 s4, v0
2053+
; GFX6-NEXT: s_waitcnt expcnt(0)
2054+
; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s4, v1
2055+
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
19632056
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
19642057
; GFX6-NEXT: s_endpgm
19652058
;

0 commit comments

Comments
 (0)