[AMDGPU] SILoadStoreOptimizer: avoid unbounded register pressure increases

jayfoad · jayfoad · commit 359a792f9b13 · 2022-02-21T10:51:14.000Z
Previously when combining two loads this pass would sink the first one down to the second one, putting the combined load where the second one was. It would also sink any intervening instructions which depended on the first load down to just after the combined load. For example, if we started with this sequence of instructions (code flowing from left to right): X A B C D E F Y After combining loads X and Y into XY we might end up with: A B C D E F XY But if B, D and F depended on X, we would get: A C E XY B D F Now if the original code had some short disjoint live ranges from A to B, C to D and E to F, in the transformed code these live ranges will be long and overlapping. In this way a single merge of two loads could cause an unbounded increase in register pressure. To fix this, change the way that loads are moved in order to merge them so that: - The second load is moved up to the first one. (But when merging stores, we still move the first store down to the second one.) - Intervening instructions are never moved. - Instead, if we find an intervening instruction that would need to be moved, give up on the merge. But this case should now be pretty rare because normal stores have no outputs, and normal loads only have address register inputs, but these will be identical for any pair of loads that we try to merge. As well as fixing the unbounded register pressure increase problem, moving loads up and stores down seems like it should usually be a win for memory latency reasons. Differential Revision: https://reviews.llvm.org/D119006
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll b/llvm/test/CodeGen/AMDGPU/ds-combine-with-dependence.ll
@@ -66,13 +66,15 @@ define amdgpu_kernel void @ds_combine_WAR(float addrspace(1)* %out, float addrsp
 }
 
 
-; The second load depends on the store. We can combine the two loads, and the combined load is
-; at the original place of the second load.
+; The second load depends on the store. We could combine the two loads, putting
+; the combined load at the original place of the second load, but we prefer to
+; leave the first load near the start of the function to hide its latency.
 
 ; GCN-LABEL: {{^}}ds_combine_RAW
 
 ; GCN:      ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:26 offset1:27
-; GCN-NEXT: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:8 offset1:26
+; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
+; GCN-NEXT: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:104
 define amdgpu_kernel void @ds_combine_RAW(float addrspace(1)* %out, float addrspace(3)* %inptr) {
 
   %base = bitcast float addrspace(3)* %inptr to i8 addrspace(3)*
diff --git a/llvm/test/CodeGen/AMDGPU/ds_read2.ll b/llvm/test/CodeGen/AMDGPU/ds_read2.ll
@@ -1244,28 +1244,28 @@ define amdgpu_kernel void @ds_read_diff_base_interleaving(
 ; CI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x2
 ; CI-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
 ; CI-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; CI-NEXT:    v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; CI-NEXT:    s_mov_b32 m0, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_add_i32_e32 v2, vcc, s4, v1
-; CI-NEXT:    v_add_i32_e32 v3, vcc, s5, v4
-; CI-NEXT:    v_add_i32_e32 v5, vcc, s6, v1
+; CI-NEXT:    v_add_i32_e32 v3, vcc, s5, v0
+; CI-NEXT:    v_add_i32_e32 v4, vcc, s6, v1
+; CI-NEXT:    v_add_i32_e32 v6, vcc, s7, v0
 ; CI-NEXT:    ds_read2_b32 v[0:1], v2 offset1:1
 ; CI-NEXT:    ds_read2_b32 v[2:3], v3 offset1:4
-; CI-NEXT:    v_add_i32_e32 v6, vcc, s7, v4
-; CI-NEXT:    ds_read2_b32 v[4:5], v5 offset1:1
+; CI-NEXT:    ds_read2_b32 v[4:5], v4 offset1:1
 ; CI-NEXT:    ds_read2_b32 v[6:7], v6 offset1:4
 ; CI-NEXT:    s_mov_b32 s3, 0xf000
+; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    s_waitcnt lgkmcnt(2)
 ; CI-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; CI-NEXT:    v_add_f32_e32 v0, 2.0, v0
-; CI-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; CI-NEXT:    s_waitcnt lgkmcnt(0)
 ; CI-NEXT:    v_mul_f32_e32 v2, v4, v6
 ; CI-NEXT:    v_sub_f32_e32 v0, v0, v2
+; CI-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; CI-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; CI-NEXT:    v_mul_f32_e32 v1, v5, v7
-; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; CI-NEXT:    buffer_store_dword v0, off, s[0:3], 0 offset:40
 ; CI-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa.ll
@@ -6,9 +6,15 @@
 @b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
 @c = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
 
+; FIXME: Should combine the DS instructions into ds_write2 and ds_read2. This
+; does not happen because when SILoadStoreOptimizer is run, the reads and writes
+; are not adjacent. They are only moved later by MachineScheduler.
+
 ; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x2:
-; GCN: ds_write2st64_b32
-; GCN: ds_read2st64_b32
+; GCN: ds_write_b32
+; GCN: ds_write_b32
+; GCN: ds_read_b32
+; GCN: ds_read_b32
 
 ; CHECK-LABEL: @no_clobber_ds_load_stores_x2
 ; CHECK: store i32 1, i32 addrspace(3)* %0, align 16, !alias.scope !0, !noalias !3
@@ -30,9 +36,11 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}no_clobber_ds_load_stores_x3:
-; GCN-DAG: ds_write2st64_b32
 ; GCN-DAG: ds_write_b32
-; GCN-DAG: ds_read2st64_b32
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_write_b32
+; GCN-DAG: ds_read_b32
+; GCN-DAG: ds_read_b32
 ; GCN-DAG: ds_read_b32
 
 ; CHECK-LABEL: @no_clobber_ds_load_stores_x3
diff --git a/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir b/llvm/test/CodeGen/AMDGPU/merge-load-store-physreg.mir
@@ -7,9 +7,9 @@
 # However, an equivalent situation can occur with buffer instructions as well.
 
 # CHECK-LABEL: name: scc_def_and_use_no_dependency
+# CHECK: DS_READ2_B32
 # CHECK: S_ADD_U32
 # CHECK: S_ADDC_U32
-# CHECK: DS_READ2_B32
 ---
 name:            scc_def_and_use_no_dependency
 machineFunctionInfo:
diff --git a/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.mir b/llvm/test/CodeGen/AMDGPU/merge-out-of-order-ldst.mir
@@ -2,9 +2,8 @@
 
 # GCN-LABEL: name: out_of_order_merge
 # GCN: DS_READ2_B64_gfx9
-# GCN: DS_WRITE_B64_gfx9
 # GCN: DS_READ2_B64_gfx9
-# GCN: DS_WRITE_B64_gfx9
+# GCN: DS_WRITE2_B64_gfx9
 # GCN: DS_WRITE_B64_gfx9
 ---
 name:            out_of_order_merge
diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
@@ -780,8 +780,8 @@ body:             |
 
 
 # GFX9-LABEL: name: gfx9_tbuffer_load_merge_across_swizzle
-# GFX9: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 116, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-# GFX9: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+# GFX9-DAG: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 116, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+# GFX9-DAG: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 123, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
 name: gfx9_tbuffer_load_merge_across_swizzle
 body:             |
   bb.0.entry:
@@ -1597,8 +1597,8 @@ body:             |
 
 
 # GFX10-LABEL: name: gfx10_tbuffer_load_merge_across_swizzle
-# GFX10: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 22, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
-# GFX10: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+# GFX10-DAG: %{{[0-9]+}}:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET %4, 0, 12, 22, 0, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+# GFX10-DAG: %{{[0-9]+}}:vreg_64 = TBUFFER_LOAD_FORMAT_XY_OFFSET %4, 0, 4, 64, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
 name: gfx10_tbuffer_load_merge_across_swizzle
 body:             |
   bb.0.entry:
diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -9,9 +9,9 @@
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
 ; CI: buffer_store_dword
 
-; GFX9: global_store_dword
 ; GFX9: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
 ; GFX9: global_store_dword
+; GFX9: global_store_dword
 define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4