Skip to content

Commit 23db2d3

Browse files
committed
[AMDGPU] Better selection of base offset when merging DS reads/writes
When merging a pair of DS reads or writes requires materializing the base offset in a VGPR, choose a value that is aligned to as high a power of two as possible. This maximises the chance that different pairs can use the same base offset, in which case the base offset registers can be commoned up by MachineCSE. Differential Revision: https://reviews.llvm.org/D96421
1 parent 5744502 commit 23db2d3

File tree

4 files changed

+91
-92
lines changed

4 files changed

+91
-92
lines changed

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -725,6 +725,16 @@ static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
725725
return NewFormatInfo->Format;
726726
}
727727

728+
// Return the value in the inclusive range [Lo,Hi] that is aligned to the
729+
// highest power of two. Note that the result is well defined for all inputs
730+
// including corner cases like:
731+
// - if Lo == Hi, return that value
732+
// - if Lo == 0, return 0 (even though the "- 1" below underflows)
733+
// - if Lo > Hi, return 0 (as if the range wrapped around)
734+
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
735+
return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
736+
}
737+
728738
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
729739
const GCNSubtarget &STI,
730740
CombineInfo &Paired,
@@ -764,8 +774,8 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
764774
return false;
765775
}
766776

767-
unsigned EltOffset0 = CI.Offset / CI.EltSize;
768-
unsigned EltOffset1 = Paired.Offset / CI.EltSize;
777+
uint32_t EltOffset0 = CI.Offset / CI.EltSize;
778+
uint32_t EltOffset1 = Paired.Offset / CI.EltSize;
769779
CI.UseST64 = false;
770780
CI.BaseOff = 0;
771781

@@ -799,22 +809,36 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
799809
}
800810

801811
// Try to shift base address to decrease offsets.
802-
unsigned OffsetDiff = std::abs((int)EltOffset1 - (int)EltOffset0);
803-
CI.BaseOff = std::min(CI.Offset, Paired.Offset);
812+
uint32_t Min = std::min(EltOffset0, EltOffset1);
813+
uint32_t Max = std::max(EltOffset0, EltOffset1);
804814

805-
if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
815+
const uint32_t Mask = maskTrailingOnes<uint32_t>(8) * 64;
816+
if (((Max - Min) & ~Mask) == 0) {
806817
if (Modify) {
807-
CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
808-
Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
818+
// From the range of values we could use for BaseOff, choose the one that
819+
// is aligned to the highest power of two, to maximise the chance that
820+
// the same offset can be reused for other load/store pairs.
821+
uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff * 64, Min);
822+
// Copy the low bits of the offsets, so that when we adjust them by
823+
// subtracting BaseOff they will be multiples of 64.
824+
BaseOff |= Min & maskTrailingOnes<uint32_t>(6);
825+
CI.BaseOff = BaseOff * CI.EltSize;
826+
CI.Offset = (EltOffset0 - BaseOff) / 64;
827+
Paired.Offset = (EltOffset1 - BaseOff) / 64;
809828
CI.UseST64 = true;
810829
}
811830
return true;
812831
}
813832

814-
if (isUInt<8>(OffsetDiff)) {
833+
if (isUInt<8>(Max - Min)) {
815834
if (Modify) {
816-
CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
817-
Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
835+
// From the range of values we could use for BaseOff, choose the one that
836+
// is aligned to the highest power of two, to maximise the chance that
837+
// the same offset can be reused for other load/store pairs.
838+
uint32_t BaseOff = mostAlignedValueInRange(Max - 0xff, Min);
839+
CI.BaseOff = BaseOff * CI.EltSize;
840+
CI.Offset = EltOffset0 - BaseOff;
841+
Paired.Offset = EltOffset1 - BaseOff;
818842
}
819843
return true;
820844
}

llvm/test/CodeGen/AMDGPU/ds-combine-large-stride.ll

Lines changed: 48 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@
99
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
1010
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
1111

12-
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
13-
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
14-
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
12+
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x200, [[BASE]]
13+
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x400, [[BASE]]
14+
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x800, [[BASE]]
1515

1616
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
17-
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
18-
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
19-
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
17+
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:72 offset1:172
18+
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset0:144 offset1:244
19+
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset0:88 offset1:188
2020
define amdgpu_kernel void @ds_read32_combine_stride_400(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
2121
bb:
2222
%tmp = load float, float addrspace(3)* %arg, align 4
@@ -52,18 +52,14 @@ bb:
5252

5353
; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
5454
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
55-
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
56-
; VI-DAG: v_add_u32_e32 [[B4:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
5755

58-
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x640, [[BASE]]
59-
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x6e0, [[BASE]]
60-
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x780, [[BASE]]
61-
; GFX9-DAG: v_add_u32_e32 [[B4:v[0-9]+]], 0x820, [[BASE]]
56+
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x400, [[BASE]]
57+
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x800, [[BASE]]
6258

63-
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:20
64-
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:20
65-
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:20
66-
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B4]] offset1:20
59+
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:144 offset1:164
60+
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:184 offset1:204
61+
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:224 offset1:244
62+
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset0:8 offset1:28
6763
define amdgpu_kernel void @ds_read32_combine_stride_20(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
6864
bb:
6965
%tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 400
@@ -102,14 +98,14 @@ bb:
10298
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
10399
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
104100

105-
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
106-
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
107-
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
101+
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x800, [[BASE]]
102+
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x400, [[BASE]]
103+
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x200, [[BASE]]
108104

109105
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:100
110-
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:100
111-
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:100
112-
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:100
106+
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:88 offset1:188
107+
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset0:144 offset1:244
108+
; GCN-DAG: ds_read2_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset0:72 offset1:172
113109
define amdgpu_kernel void @ds_read32_combine_stride_400_back(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
114110
bb:
115111
%tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
@@ -180,16 +176,11 @@ bb:
180176
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
181177

182178
; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
183-
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
184-
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
185-
186179
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
187-
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
188-
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
189180

190181
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:32
191-
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:32
192-
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:32
182+
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:64 offset1:96
183+
; GCN-DAG: ds_read2st64_b32 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:128 offset1:160
193184
define amdgpu_kernel void @ds_read32_combine_stride_8192_shifted(float addrspace(3)* nocapture readonly %arg, float *nocapture %arg1) {
194185
bb:
195186
%tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
@@ -219,12 +210,12 @@ bb:
219210
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
220211

221212
; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
222-
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]
213+
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x800, [[BASE]]
223214

224215
; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset1:50
225216
; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:100 offset1:150
226217
; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[BASE]] offset0:200 offset1:250
227-
; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:50
218+
; GCN-DAG: ds_read2_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:44 offset1:94
228219
define amdgpu_kernel void @ds_read64_combine_stride_400(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
229220
bb:
230221
%tmp = load double, double addrspace(3)* %arg, align 8
@@ -259,16 +250,11 @@ bb:
259250
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
260251

261252
; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
262-
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
263-
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
264-
265253
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
266-
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
267-
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
268254

269255
; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset1:16
270-
; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B2]] offset1:16
271-
; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B3]] offset1:16
256+
; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:32 offset1:48
257+
; GCN-DAG: ds_read2st64_b64 v[{{[0-9]+:[0-9]+}}], [[B1]] offset0:64 offset1:80
272258
define amdgpu_kernel void @ds_read64_combine_stride_8192_shifted(double addrspace(3)* nocapture readonly %arg, double *nocapture %arg1) {
273259
bb:
274260
%tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1
@@ -301,14 +287,14 @@ bb:
301287
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
302288
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
303289

304-
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
305-
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
306-
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
290+
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x200, [[BASE]]
291+
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x400, [[BASE]]
292+
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x800, [[BASE]]
307293

308294
; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
309-
; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
310-
; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
311-
; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
295+
; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset0:72 offset1:172
296+
; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset0:144 offset1:244
297+
; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset0:88 offset1:188
312298
define amdgpu_kernel void @ds_write32_combine_stride_400(float addrspace(3)* nocapture %arg) {
313299
bb:
314300
store float 1.000000e+00, float addrspace(3)* %arg, align 4
@@ -337,14 +323,14 @@ bb:
337323
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
338324
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
339325

340-
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x320, [[BASE]]
341-
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x640, [[BASE]]
342-
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x960, [[BASE]]
326+
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x800, [[BASE]]
327+
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x400, [[BASE]]
328+
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x200, [[BASE]]
343329

330+
; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset0:88 offset1:188
331+
; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset0:144 offset1:244
332+
; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset0:72 offset1:172
344333
; GCN-DAG: ds_write2_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
345-
; GCN-DAG: ds_write2_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
346-
; GCN-DAG: ds_write2_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
347-
; GCN-DAG: ds_write2_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:100
348334
define amdgpu_kernel void @ds_write32_combine_stride_400_back(float addrspace(3)* nocapture %arg) {
349335
bb:
350336
%tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 700
@@ -396,17 +382,12 @@ bb:
396382
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
397383
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
398384

399-
; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 4, [[BASE]]
400-
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
401-
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
402-
403-
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 4, [[BASE]]
404-
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4004, [[BASE]]
405-
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8004, [[BASE]]
385+
; VI-DAG: v_add_u32_e32 [[BASE:v[0-9]+]], vcc, 4, [[BASE]]
386+
; GFX9-DAG: v_add_u32_e32 [[BASE:v[0-9]+]], 4, [[BASE]]
406387

407-
; GCN-DAG: ds_write2st64_b32 [[B1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
408-
; GCN-DAG: ds_write2st64_b32 [[B2]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
409-
; GCN-DAG: ds_write2st64_b32 [[B3]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
388+
; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset1:32
389+
; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:64 offset1:96
390+
; GCN-DAG: ds_write2st64_b32 [[BASE]], v{{[0-9]+}}, v{{[0-9]+}} offset0:128 offset1:160
410391
define amdgpu_kernel void @ds_write32_combine_stride_8192_shifted(float addrspace(3)* nocapture %arg) {
411392
bb:
412393
%tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
@@ -429,12 +410,12 @@ bb:
429410
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
430411

431412
; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
432-
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x960, [[BASE]]
413+
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 0x800, [[BASE]]
433414

434415
; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
435416
; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:100 offset1:150
436417
; GCN-DAG: ds_write2_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:200 offset1:250
437-
; GCN-DAG: ds_write2_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:50
418+
; GCN-DAG: ds_write2_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:44 offset1:94
438419
define amdgpu_kernel void @ds_write64_combine_stride_400(double addrspace(3)* nocapture %arg) {
439420
bb:
440421
store double 1.000000e+00, double addrspace(3)* %arg, align 8
@@ -459,17 +440,12 @@ bb:
459440
; GCN: s_load_dword [[ARG:s[0-9]+]], s[4:5], 0x0
460441
; GCN: v_mov_b32_e32 [[BASE:v[0-9]+]], [[ARG]]
461442

462-
; VI-DAG: v_add_u32_e32 [[B1:v[0-9]+]], vcc, 8, [[BASE]]
463-
; VI-DAG: v_add_u32_e32 [[B2:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
464-
; VI-DAG: v_add_u32_e32 [[B3:v[0-9]+]], vcc, {{s[0-9]+}}, [[BASE]]
465-
466-
; GFX9-DAG: v_add_u32_e32 [[B1:v[0-9]+]], 8, [[BASE]]
467-
; GFX9-DAG: v_add_u32_e32 [[B2:v[0-9]+]], 0x4008, [[BASE]]
468-
; GFX9-DAG: v_add_u32_e32 [[B3:v[0-9]+]], 0x8008, [[BASE]]
443+
; VI-DAG: v_add_u32_e32 [[BASE]], vcc, 8, [[BASE]]
444+
; GFX9-DAG: v_add_u32_e32 [[BASE]], 8, [[BASE]]
469445

470-
; GCN-DAG: ds_write2st64_b64 [[B1]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
471-
; GCN-DAG: ds_write2st64_b64 [[B2]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
472-
; GCN-DAG: ds_write2st64_b64 [[B3]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
446+
; GCN-DAG: ds_write2st64_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset1:16
447+
; GCN-DAG: ds_write2st64_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:32 offset1:48
448+
; GCN-DAG: ds_write2st64_b64 [[BASE]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] offset0:64 offset1:80
473449
define amdgpu_kernel void @ds_write64_combine_stride_8192_shifted(double addrspace(3)* nocapture %arg) {
474450
bb:
475451
%tmp = getelementptr inbounds double, double addrspace(3)* %arg, i32 1

llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,11 @@ define amdgpu_kernel void @same_address_fence_merge_write2() #0 {
1414
; GCN-NEXT: s_mov_b32 s1, 0x40100000
1515
; GCN-NEXT: v_mov_b32_e32 v0, s0
1616
; GCN-NEXT: v_mov_b32_e32 v1, s1
17-
; GCN-NEXT: v_add_u32_e32 v3, 0x840, v2
18-
; GCN-NEXT: v_add_u32_e32 v4, 0xc60, v2
17+
; GCN-NEXT: v_add_u32_e32 v3, 0x800, v2
1918
; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
2019
; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
21-
; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset1:66
22-
; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[0:1] offset1:66
20+
; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74
21+
; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206
2322
; GCN-NEXT: s_mov_b32 s1, 0x3ff00000
2423
; GCN-NEXT: v_mov_b32_e32 v0, s0
2524
; GCN-NEXT: v_mov_b32_e32 v1, s1
@@ -28,8 +27,8 @@ define amdgpu_kernel void @same_address_fence_merge_write2() #0 {
2827
; GCN-NEXT: s_waitcnt lgkmcnt(0)
2928
; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
3029
; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
31-
; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset1:66
32-
; GCN-NEXT: ds_write2_b64 v4, v[0:1], v[0:1] offset1:66
30+
; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74
31+
; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206
3332
; GCN-NEXT: s_endpgm
3433
bb:
3534
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0

llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -11,12 +11,12 @@
1111
# VI: V_ADD_CO_U32_e64 %6, %0,
1212
# VI-NEXT: DS_WRITE2_B32 killed %7, %0, %3, 0, 8,
1313
# VI: V_ADD_CO_U32_e64 %10, %3,
14-
# VI-NEXT: DS_READ2_B32 killed %11, 0, 8,
14+
# VI-NEXT: DS_READ2_B32 killed %11, 16, 24,
1515

1616
# GFX9: V_ADD_U32_e64 %6, %0,
1717
# GFX9-NEXT: DS_WRITE2_B32_gfx9 killed %7, %0, %3, 0, 8,
1818
# GFX9: V_ADD_U32_e64 %9, %3,
19-
# GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 0, 8,
19+
# GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 16, 24,
2020

2121
--- |
2222
@0 = internal unnamed_addr addrspace(3) global [256 x float] undef, align 4
@@ -94,12 +94,12 @@ body: |
9494
# VI: V_ADD_CO_U32_e64 %6, %0.sub0,
9595
# VI-NEXT: DS_WRITE2_B32 killed %7, %0.sub0, %3.sub0, 0, 8,
9696
# VI: V_ADD_CO_U32_e64 %10, %3.sub0,
97-
# VI-NEXT: DS_READ2_B32 killed %11, 0, 8,
97+
# VI-NEXT: DS_READ2_B32 killed %11, 16, 24,
9898

9999
# GFX9: V_ADD_U32_e64 %6, %0.sub0,
100100
# GFX9-NEXT: DS_WRITE2_B32_gfx9 killed %7, %0.sub0, %3.sub0, 0, 8,
101101
# GFX9: V_ADD_U32_e64 %9, %3.sub0,
102-
# GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 0, 8,
102+
# GFX9-NEXT: DS_READ2_B32_gfx9 killed %10, 16, 24,
103103
---
104104
name: ds_combine_base_offset_subreg
105105
body: |

0 commit comments

Comments
 (0)