Skip to content

Commit cf6565f

Browse files
committed
[AMDGPU] Enable multi-dword flat scratch load/stores
Differential Revision: https://reviews.llvm.org/D91384
1 parent 4726a40 commit cf6565f

10 files changed

+268
-414
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1288,7 +1288,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
12881288

12891289
AMD_HSA_BITS_SET(Out.code_properties,
12901290
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
1291-
getElementByteSizeValue(STM.getMaxPrivateElementSize()));
1291+
getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
12921292

12931293
if (MFI->hasPrivateSegmentBuffer()) {
12941294
Out.code_properties |=

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -493,8 +493,8 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
493493
return LDSBankCount;
494494
}
495495

496-
unsigned getMaxPrivateElementSize() const {
497-
return MaxPrivateElementSize;
496+
unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
497+
return (ForBufferRSrc || !enableFlatScratch()) ? MaxPrivateElementSize : 16;
498498
}
499499

500500
unsigned getConstantBusLimit(unsigned Opcode) const;

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8604,7 +8604,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
86048604
return SplitVectorStore(Op, DAG);
86058605
return SDValue();
86068606
case 16:
8607-
if (NumElements > 4 || NumElements == 3)
8607+
if (NumElements > 4 ||
8608+
(NumElements == 3 && !Subtarget->enableFlatScratch()))
86088609
return SplitVectorStore(Op, DAG);
86098610
return SDValue();
86108611
default:

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6569,7 +6569,7 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
65696569

65706570
// GFX9 doesn't have ELEMENT_SIZE.
65716571
if (ST.getGeneration() <= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
6572-
uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
6572+
uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize(true)) - 1;
65736573
Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
65746574
}
65756575

llvm/test/CodeGen/AMDGPU/flat-scratch.ll

Lines changed: 226 additions & 342 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/local-stack-alloc-block-sp-reference.ll

Lines changed: 8 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -71,19 +71,14 @@ define amdgpu_kernel void @local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8
7171
; FLATSCR-NEXT: scratch_store_byte off, v0, s3
7272
; FLATSCR-NEXT: s_cbranch_scc1 BB0_1
7373
; FLATSCR-NEXT: ; %bb.2: ; %split
74-
; FLATSCR-NEXT: s_movk_i32 s2, 0x20d0
75-
; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2
76-
; FLATSCR-NEXT: scratch_load_dword v1, off, s2 offset:4
7774
; FLATSCR-NEXT: s_movk_i32 s2, 0x2000
7875
; FLATSCR-NEXT: s_add_u32 s2, 0x3000, s2
79-
; FLATSCR-NEXT: scratch_load_dword v0, off, s2 offset:208
80-
; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
81-
; FLATSCR-NEXT: scratch_load_dword v2, off, s2 offset:68
76+
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, s2 offset:208
8277
; FLATSCR-NEXT: s_movk_i32 s2, 0x3000
83-
; FLATSCR-NEXT: scratch_load_dword v3, off, s2 offset:64
78+
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s2 offset:64
8479
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
85-
; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v3
86-
; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
80+
; FLATSCR-NEXT: v_add_co_u32_e32 v0, vcc, v0, v2
81+
; FLATSCR-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
8782
; FLATSCR-NEXT: s_waitcnt lgkmcnt(0)
8883
; FLATSCR-NEXT: v_mov_b32_e32 v3, s1
8984
; FLATSCR-NEXT: v_mov_b32_e32 v2, s0
@@ -162,23 +157,17 @@ define void @func_local_stack_offset_uses_sp(i64 addrspace(1)* %out, i8 addrspac
162157
; FLATSCR-NEXT: scratch_store_byte off, v2, s1
163158
; FLATSCR-NEXT: s_cbranch_scc1 BB1_1
164159
; FLATSCR-NEXT: ; %bb.2: ; %split
165-
; FLATSCR-NEXT: s_movk_i32 s0, 0x20d0
166-
; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000
167-
; FLATSCR-NEXT: s_add_u32 s0, s1, s0
168-
; FLATSCR-NEXT: scratch_load_dword v3, off, s0 offset:4
169160
; FLATSCR-NEXT: s_movk_i32 s0, 0x2000
170161
; FLATSCR-NEXT: s_add_u32 s1, s33, 0x1000
171162
; FLATSCR-NEXT: s_add_u32 s0, s1, s0
172-
; FLATSCR-NEXT: scratch_load_dword v2, off, s0 offset:208
173-
; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000
174-
; FLATSCR-NEXT: scratch_load_dword v4, off, s0 offset:68
163+
; FLATSCR-NEXT: scratch_load_dwordx2 v[2:3], off, s0 offset:208
175164
; FLATSCR-NEXT: s_add_u32 s0, s33, 0x1000
176-
; FLATSCR-NEXT: scratch_load_dword v5, off, s0 offset:64
165+
; FLATSCR-NEXT: scratch_load_dwordx2 v[4:5], off, s0 offset:64
177166
; FLATSCR-NEXT: s_sub_u32 s32, s32, 0x6000
178167
; FLATSCR-NEXT: s_mov_b32 s33, s2
179168
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
180-
; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v5
181-
; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
169+
; FLATSCR-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
170+
; FLATSCR-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
182171
; FLATSCR-NEXT: global_store_dwordx2 v[0:1], v[2:3], off
183172
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
184173
; FLATSCR-NEXT: s_setpc_b64 s[30:31]

llvm/test/CodeGen/AMDGPU/memcpy-fixed-align.ll

Lines changed: 7 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,24 +31,15 @@ define void @memcpy_fixed_align(i8 addrspace(5)* %dst, i8 addrspace(1)* %src) {
3131
; FLATSCR-LABEL: memcpy_fixed_align:
3232
; FLATSCR: ; %bb.0:
3333
; FLATSCR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
34-
; FLATSCR-NEXT: global_load_dword v0, v[1:2], off offset:36
35-
; FLATSCR-NEXT: global_load_dword v11, v[1:2], off offset:32
34+
; FLATSCR-NEXT: global_load_dwordx2 v[11:12], v[1:2], off offset:32
3635
; FLATSCR-NEXT: global_load_dwordx4 v[3:6], v[1:2], off offset:16
3736
; FLATSCR-NEXT: global_load_dwordx4 v[7:10], v[1:2], off
38-
; FLATSCR-NEXT: s_waitcnt vmcnt(3)
39-
; FLATSCR-NEXT: scratch_store_dword off, v0, s32 offset:36
40-
; FLATSCR-NEXT: s_waitcnt vmcnt(3)
41-
; FLATSCR-NEXT: scratch_store_dword off, v11, s32 offset:32
42-
; FLATSCR-NEXT: s_waitcnt vmcnt(3)
43-
; FLATSCR-NEXT: scratch_store_dword off, v6, s32 offset:28
44-
; FLATSCR-NEXT: scratch_store_dword off, v5, s32 offset:24
45-
; FLATSCR-NEXT: scratch_store_dword off, v4, s32 offset:20
46-
; FLATSCR-NEXT: scratch_store_dword off, v3, s32 offset:16
47-
; FLATSCR-NEXT: s_waitcnt vmcnt(6)
48-
; FLATSCR-NEXT: scratch_store_dword off, v10, s32 offset:12
49-
; FLATSCR-NEXT: scratch_store_dword off, v9, s32 offset:8
50-
; FLATSCR-NEXT: scratch_store_dword off, v8, s32 offset:4
51-
; FLATSCR-NEXT: scratch_store_dword off, v7, s32
37+
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
38+
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[11:12], s32 offset:32
39+
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
40+
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[3:6], s32 offset:16
41+
; FLATSCR-NEXT: s_waitcnt vmcnt(2)
42+
; FLATSCR-NEXT: scratch_store_dwordx4 off, v[7:10], s32
5243
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
5344
; FLATSCR-NEXT: s_setpc_b64 s[30:31]
5445
%alloca = alloca [40 x i8], addrspace(5)

llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -69,13 +69,12 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
6969
; FLATSCR-NEXT: s_mov_b32 s2, s32
7070
; FLATSCR-NEXT: s_movk_i32 s3, 0x1000
7171
; FLATSCR-NEXT: s_add_i32 s4, s2, s3
72-
; FLATSCR-NEXT: s_add_u32 s2, s2, s3
7372
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
74-
; FLATSCR-NEXT: scratch_store_dword off, v1, s2
75-
; FLATSCR-NEXT: v_mov_b32_e32 v1, 1
73+
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
74+
; FLATSCR-NEXT: s_add_u32 s2, s2, s3
75+
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
7676
; FLATSCR-NEXT: s_lshl_b32 s2, s6, 2
7777
; FLATSCR-NEXT: s_mov_b32 s32, s4
78-
; FLATSCR-NEXT: scratch_store_dword off, v1, s4 offset:4
7978
; FLATSCR-NEXT: s_add_i32 s4, s4, s2
8079
; FLATSCR-NEXT: scratch_load_dword v1, off, s4
8180
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
@@ -174,11 +173,10 @@ define amdgpu_kernel void @kernel_non_entry_block_static_alloca_uniformly_reache
174173
; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
175174
; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
176175
; FLATSCR-NEXT: v_mov_b32_e32 v1, 0
177-
; FLATSCR-NEXT: scratch_store_dword off, v1, s2
178-
; FLATSCR-NEXT: v_mov_b32_e32 v1, 1
176+
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
179177
; FLATSCR-NEXT: s_lshl_b32 s3, s3, 2
180178
; FLATSCR-NEXT: s_mov_b32 s32, s2
181-
; FLATSCR-NEXT: scratch_store_dword off, v1, s2 offset:4
179+
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2
182180
; FLATSCR-NEXT: s_add_i32 s2, s2, s3
183181
; FLATSCR-NEXT: scratch_load_dword v1, off, s2
184182
; FLATSCR-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
@@ -275,11 +273,10 @@ define void @func_non_entry_block_static_alloca_align4(i32 addrspace(1)* %out, i
275273
; FLATSCR-NEXT: s_mov_b32 s2, s32
276274
; FLATSCR-NEXT: s_movk_i32 s3, 0x1000
277275
; FLATSCR-NEXT: s_add_i32 s4, s2, s3
278-
; FLATSCR-NEXT: s_add_u32 s2, s2, s3
279276
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
280-
; FLATSCR-NEXT: scratch_store_dword off, v2, s2
281-
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
282-
; FLATSCR-NEXT: scratch_store_dword off, v2, s4 offset:4
277+
; FLATSCR-NEXT: v_mov_b32_e32 v3, 1
278+
; FLATSCR-NEXT: s_add_u32 s2, s2, s3
279+
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2
283280
; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s4
284281
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
285282
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v5
@@ -371,11 +368,10 @@ define void @func_non_entry_block_static_alloca_align64(i32 addrspace(1)* %out,
371368
; FLATSCR-NEXT: ; %bb.1: ; %bb.0
372369
; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000
373370
; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000
374-
; FLATSCR-NEXT: v_mov_b32_e32 v2, 0
375-
; FLATSCR-NEXT: scratch_store_dword off, v2, s2
376-
; FLATSCR-NEXT: v_mov_b32_e32 v2, 1
377-
; FLATSCR-NEXT: scratch_store_dword off, v2, s2 offset:4
371+
; FLATSCR-NEXT: v_mov_b32_e32 v5, 0
372+
; FLATSCR-NEXT: v_mov_b32_e32 v6, 1
378373
; FLATSCR-NEXT: v_lshl_add_u32 v2, v3, 2, s2
374+
; FLATSCR-NEXT: scratch_store_dwordx2 off, v[5:6], s2
379375
; FLATSCR-NEXT: scratch_load_dword v2, v2, off
380376
; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v4
381377
; FLATSCR-NEXT: s_mov_b32 s32, s2

llvm/test/CodeGen/AMDGPU/scratch-simple.ll

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@
1616
;
1717
; GCN-LABEL: {{^}}ps_main:
1818

19-
; GFX9-FLATSCR: s_add_u32 flat_scratch_lo, s0, s2
20-
; GFX9-FLATSCR: s_addc_u32 flat_scratch_hi, s1, 0
19+
; GFX9-FLATSCR-DAG: s_add_u32 flat_scratch_lo, s0, s2
20+
; GFX9-FLATSCR-DAG: s_addc_u32 flat_scratch_hi, s1, 0
21+
; GFX9-FLATSCR-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
2122

2223
; GFX10-FLATSCR: s_add_u32 s0, s0, s2
2324
; GFX10-FLATSCR: s_addc_u32 s1, s1, 0
@@ -36,15 +37,13 @@
3637
; FLATSCR-NOT: SCRATCH_RSRC_DWORD
3738

3839
; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0
39-
; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset:
40-
; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0
41-
; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset:
40+
; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SP]] offset:
4241

43-
; GFX10-FLATSCR: scratch_store_dword off, v2, off offset:
44-
; GFX10-FLATSCR: scratch_store_dword off, v2, off offset:
42+
; GFX10-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], off offset:
4543

46-
; GCN-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
47-
; GCN-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
44+
; MUBUF-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
45+
; MUBUF-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
46+
; GFX10-FLATSCR: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, v0
4847
; GCN-NOT: s_mov_b32 s0
4948

5049
; GCN-DAG: v_add{{_|_nc_}}{{i|u}}32_e32 [[HI_OFF:v[0-9]+]],{{.*}} 0x280, [[CLAMP_IDX]]
@@ -53,7 +52,6 @@
5352
; MUBUF: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
5453
; MUBUF: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, 0 offen
5554
; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[LO_OFF]], off
56-
; FLATSCR: scratch_load_dword {{v[0-9]+}}, [[HI_OFF]], off
5755
define amdgpu_ps float @ps_main(i32 %idx) {
5856
%v1 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0xBFEA477C60000000, float 0xBFEBE5DC60000000, float 0xBFEC71C720000000, float 0xBFEBE5DC60000000, float 0xBFEA477C60000000, float 0xBFE7A693C0000000, float 0xBFE41CFEA0000000, float 0x3FDF9B13E0000000, float 0x3FDF9B1380000000, float 0x3FD5C53B80000000, float 0x3FD5C53B00000000, float 0x3FC6326AC0000000, float 0x3FC63269E0000000, float 0xBEE05CEB00000000, float 0xBEE086A320000000, float 0xBFC63269E0000000, float 0xBFC6326AC0000000, float 0xBFD5C53B80000000, float 0xBFD5C53B80000000, float 0xBFDF9B13E0000000, float 0xBFDF9B1460000000, float 0xBFE41CFE80000000, float 0x3FE7A693C0000000, float 0x3FEA477C20000000, float 0x3FEBE5DC40000000, float 0x3FEC71C6E0000000, float 0x3FEBE5DC40000000, float 0x3FEA477C20000000, float 0x3FE7A693C0000000, float 0xBFE41CFE80000000>, i32 %idx
5957
%v2 = extractelement <81 x float> <float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float undef, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFEA0000000, float 0xBFE7A693C0000000, float 0x3FE7A693C0000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFEBE5DC40000000, float 0x3FEBE5DC40000000, float 0xBFEC71C720000000, float 0x3FEC71C6E0000000, float 0xBFEBE5DC60000000, float 0x3FEBE5DC40000000, float 0xBFEA477C20000000, float 0x3FEA477C20000000, float 0xBFE7A693C0000000, float 0x3FE7A69380000000, float 0xBFE41CFEA0000000, float 0xBFDF9B13E0000000, float 0xBFD5C53B80000000, float 0xBFC6326AC0000000, float 0x3EE0789320000000, float 0x3FC6326AC0000000, float 0x3FD5C53B80000000, float 0x3FDF9B13E0000000, float 0x3FE41CFE80000000>, i32 %idx
@@ -79,9 +77,7 @@ define amdgpu_ps float @ps_main(i32 %idx) {
7977
; MUBUF: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 offen
8078

8179
; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0
82-
; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset:
83-
; GFX9-FLATSCR: s_mov_b32 [[SP:[^,]+]], 0
84-
; GFX9-FLATSCR: scratch_store_dword off, v2, [[SP]] offset:
80+
; GFX9-FLATSCR: scratch_store_dwordx4 off, v[{{[0-9:]+}}], [[SP]] offset:
8581

8682
; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off
8783
; FLATSCR: scratch_load_dword {{v[0-9]+}}, {{v[0-9]+}}, off

llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -78,10 +78,7 @@ define amdgpu_kernel void @kernel_background_evaluate(float addrspace(5)* %kg, <
7878
; FLATSCR-NEXT: ; %bb.1: ; %if.then4.i
7979
; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000
8080
; FLATSCR-NEXT: s_nop 1
81-
; FLATSCR-NEXT: scratch_load_dword v0, off, vcc_lo offset:4
82-
; FLATSCR-NEXT: s_waitcnt_depctr 0xffe3
83-
; FLATSCR-NEXT: s_movk_i32 vcc_lo, 0x4000
84-
; FLATSCR-NEXT: scratch_load_dword v1, off, vcc_lo offset:8
81+
; FLATSCR-NEXT: scratch_load_dwordx2 v[0:1], off, vcc_lo offset:4
8582
; FLATSCR-NEXT: s_waitcnt vmcnt(0)
8683
; FLATSCR-NEXT: v_add_nc_u32_e32 v0, v1, v0
8784
; FLATSCR-NEXT: v_mul_lo_u32 v0, 0x41c64e6d, v0

0 commit comments

Comments
 (0)