Skip to content

Commit 359a792

Browse files
committed
[AMDGPU] SILoadStoreOptimizer: avoid unbounded register pressure increases
Previously when combining two loads this pass would sink the first one down to the second one, putting the combined load where the second one was. It would also sink any intervening instructions which depended on the first load down to just after the combined load.

For example, if we started with this sequence of instructions (code flowing from left to right):

  X A B C D E F Y

After combining loads X and Y into XY we might end up with:

  A B C D E F XY

But if B, D and F depended on X, we would get:

  A C E XY B D F

Now if the original code had some short disjoint live ranges from A to B, C to D and E to F, in the transformed code these live ranges will be long and overlapping. In this way a single merge of two loads could cause an unbounded increase in register pressure.

To fix this, change the way that loads are moved in order to merge them so that:
- The second load is moved up to the first one. (But when merging stores, we still move the first store down to the second one.)
- Intervening instructions are never moved.
- Instead, if we find an intervening instruction that would need to be moved, give up on the merge. But this case should now be pretty rare because normal stores have no outputs, and normal loads only have address register inputs, but these will be identical for any pair of loads that we try to merge.

As well as fixing the unbounded register pressure increase problem, moving loads up and stores down seems like it should usually be a win for memory latency reasons.

Differential Revision: https://reviews.llvm.org/D119006
1 parent 18bfc57 commit 359a792

File tree

8 files changed

+151
-225
lines changed

8 files changed

+151
-225
lines changed

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 120 additions & 203 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,13 +66,15 @@ define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrsp
6666
}
6767

6868

69-
; The second load depends on the store. We can combine the two loads, and the combined load is
70-
; at the original place of the second load.
69+
; The second load depends on the store. We could combine the two loads, putting
70+
; the combined load at the original place of the second load, but we prefer to
71+
; leave the first load near the start of the function to hide its latency.
7172

7273
; GCN-LABEL: {{^}}ds_combine_RAW
7374

7475
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
75-
; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:26
76+
; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
77+
; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
7678
define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
7779

7880
%base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*

llvm/test/CodeGen/AMDGPU/ds_read2.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1244,28 +1244,28 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving(
12441244
; CI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x2
12451245
; CI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
12461246
; CI-NEXT: v_lshlrev_b32_e32 v1, 4, v1
1247-
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
1247+
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
12481248
; CI-NEXT: s_mov_b32 m0, -1
12491249
; CI-NEXT: s_waitcnt lgkmcnt(0)
12501250
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v1
1251-
; CI-NEXT: v_add_i32_e32 v3, vcc, s5, v4
1252-
; CI-NEXT: v_add_i32_e32 v5, vcc, s6, v1
1251+
; CI-NEXT: v_add_i32_e32 v3, vcc, s5, v0
1252+
; CI-NEXT: v_add_i32_e32 v4, vcc, s6, v1
1253+
; CI-NEXT: v_add_i32_e32 v6, vcc, s7, v0
12531254
; CI-NEXT: ds_read2_b32 v[0:1], v2 offset1:1
12541255
; CI-NEXT: ds_read2_b32 v[2:3], v3 offset1:4
1255-
; CI-NEXT: v_add_i32_e32 v6, vcc, s7, v4
1256-
; CI-NEXT: ds_read2_b32 v[4:5], v5 offset1:1
1256+
; CI-NEXT: ds_read2_b32 v[4:5], v4 offset1:1
12571257
; CI-NEXT: ds_read2_b32 v[6:7], v6 offset1:4
12581258
; CI-NEXT: s_mov_b32 s3, 0xf000
1259+
; CI-NEXT: s_mov_b32 s2, -1
12591260
; CI-NEXT: s_waitcnt lgkmcnt(2)
12601261
; CI-NEXT: v_mul_f32_e32 v0, v0, v2
12611262
; CI-NEXT: v_add_f32_e32 v0, 2.0, v0
1262-
; CI-NEXT: v_mul_f32_e32 v1, v1, v3
12631263
; CI-NEXT: s_waitcnt lgkmcnt(0)
12641264
; CI-NEXT: v_mul_f32_e32 v2, v4, v6
12651265
; CI-NEXT: v_sub_f32_e32 v0, v0, v2
1266+
; CI-NEXT: v_mul_f32_e32 v1, v1, v3
12661267
; CI-NEXT: v_sub_f32_e32 v0, v0, v1
12671268
; CI-NEXT: v_mul_f32_e32 v1, v5, v7
1268-
; CI-NEXT: s_mov_b32 s2, -1
12691269
; CI-NEXT: v_sub_f32_e32 v0, v0, v1
12701270
; CI-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:40
12711271
; CI-NEXT: s_endpgm

llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,15 @@
66
@b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
77
@c = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
88

9+
; FIXME: Should combine the DS instructions into ds_write2 and ds_read2. This
10+
; does not happen because when SILoadStoreOptimizer is run, the reads and writes
11+
; are not adjacent. They are only moved later by MachineScheduler.
12+
913
; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x2:
10-
; GCN: ds_write2st64_b32
11-
; GCN: ds_read2st64_b32
14+
; GCN: ds_write_b32
15+
; GCN: ds_write_b32
16+
; GCN: ds_read_b32
17+
; GCN: ds_read_b32
1218

1319
; CHECK-LABEL: @no_clobber_ds_load_stores_x2
1420
; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !0, !noalias !3
@@ -30,9 +36,11 @@ bb:
3036
}
3137

3238
; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x3:
33-
; GCN-DAG: ds_write2st64_b32
3439
; GCN-DAG: ds_write_b32
35-
; GCN-DAG: ds_read2st64_b32
40+
; GCN-DAG: ds_write_b32
41+
; GCN-DAG: ds_write_b32
42+
; GCN-DAG: ds_read_b32
43+
; GCN-DAG: ds_read_b32
3644
; GCN-DAG: ds_read_b32
3745

3846
; CHECK-LABEL: @no_clobber_ds_load_stores_x3

llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,9 @@
77
# However, an equivalent situation can occur with buffer instructions as well.
88

99
# CHECK-LABEL: name: scc_def_and_use_no_dependency
10+
# CHECK: DS_READ2_B32
1011
# CHECK: S_ADD_U32
1112
# CHECK: S_ADDC_U32
12-
# CHECK: DS_READ2_B32
1313
---
1414
name: scc_def_and_use_no_dependency
1515
machineFunctionInfo:

llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.mir

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,8 @@
22

33
# GCN-LABEL: name: out_of_order_merge
44
# GCN: DS_READ2_B64_gfx9
5-
# GCN: DS_WRITE_B64_gfx9
65
# GCN: DS_READ2_B64_gfx9
7-
# GCN: DS_WRITE_B64_gfx9
6+
# GCN: DS_WRITE2_B64_gfx9
87
# GCN: DS_WRITE_B64_gfx9
98
---
109
name: out_of_order_merge

llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -780,8 +780,8 @@ body: |
780780

781781

782782
# GFX9-LABEL: name: gfx9_tbuffer_load_merge_across_swizzle
783-
# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 116, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
784-
# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
783+
# GFX9-DAG: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 116, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
784+
# GFX9-DAG: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
785785
name: gfx9_tbuffer_load_merge_across_swizzle
786786
body: |
787787
bb.0.entry:
@@ -1597,8 +1597,8 @@ body: |
15971597

15981598

15991599
# GFX10-LABEL: name: gfx10_tbuffer_load_merge_across_swizzle
1600-
# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 22, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
1601-
# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
1600+
# GFX10-DAG: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 22, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
1601+
# GFX10-DAG: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
16021602
name: gfx10_tbuffer_load_merge_across_swizzle
16031603
body: |
16041604
bb.0.entry:

llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@
99
; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
1010
; CI: buffer_store_dword
1111

12-
; GFX9: global_store_dword
1312
; GFX9: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
1413
; GFX9: global_store_dword
14+
; GFX9: global_store_dword
1515
define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
1616
%ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
1717

0 commit comments

Comments
 (0)