Skip to content

Commit 26e0864

Browse files
committed
Take the alignment of the first load into consideration when selecting the constrained S_LOAD opcodes.
1 parent 09556a9 commit 26e0864

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

58 files changed

+1955
-1969
lines changed

llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp

Lines changed: 23 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1725,36 +1725,30 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
17251725
case 8:
17261726
return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
17271727
}
1728-
case S_LOAD_IMM:
1729-
// Use the constrained opcodes when the subtarget has the XNACK support
1730-
// enabled.
1731-
if (STM->isXNACKEnabled()) {
1732-
switch (Width) {
1733-
default:
1734-
return 0;
1735-
case 2:
1736-
return AMDGPU::S_LOAD_DWORDX2_IMM_ec;
1737-
case 3:
1738-
return AMDGPU::S_LOAD_DWORDX3_IMM_ec;
1739-
case 4:
1740-
return AMDGPU::S_LOAD_DWORDX4_IMM_ec;
1741-
case 8:
1742-
return AMDGPU::S_LOAD_DWORDX8_IMM_ec;
1743-
}
1744-
} else {
1745-
switch (Width) {
1746-
default:
1747-
return 0;
1748-
case 2:
1749-
return AMDGPU::S_LOAD_DWORDX2_IMM;
1750-
case 3:
1751-
return AMDGPU::S_LOAD_DWORDX3_IMM;
1752-
case 4:
1753-
return AMDGPU::S_LOAD_DWORDX4_IMM;
1754-
case 8:
1755-
return AMDGPU::S_LOAD_DWORDX8_IMM;
1756-
}
1728+
case S_LOAD_IMM: {
1729+
// If XNACK is enabled, use the constrained opcodes when the first load is
1730+
// under-aligned.
1731+
const MachineMemOperand *MMO = *CI.I->memoperands_begin();
1732+
auto NeedsConstrainedOpc = [&MMO, Width](const GCNSubtarget &ST) {
1733+
return ST.isXNACKEnabled() && MMO->getAlign().value() < Width;
1734+
};
1735+
switch (Width) {
1736+
default:
1737+
return 0;
1738+
case 2:
1739+
return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX2_IMM_ec
1740+
: AMDGPU::S_LOAD_DWORDX2_IMM;
1741+
case 3:
1742+
return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX3_IMM_ec
1743+
: AMDGPU::S_LOAD_DWORDX3_IMM;
1744+
case 4:
1745+
return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX4_IMM_ec
1746+
: AMDGPU::S_LOAD_DWORDX4_IMM;
1747+
case 8:
1748+
return NeedsConstrainedOpc(*STM) ? AMDGPU::S_LOAD_DWORDX8_IMM_ec
1749+
: AMDGPU::S_LOAD_DWORDX8_IMM;
17571750
}
1751+
}
17581752
case GLOBAL_LOAD:
17591753
switch (Width) {
17601754
default:

llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -159,10 +159,10 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
159159
define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr, <2 x half> %data) {
160160
; GFX940-LABEL: local_atomic_fadd_v2f16_noret:
161161
; GFX940: ; %bb.0:
162-
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
162+
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
163163
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
164-
; GFX940-NEXT: v_mov_b32_e32 v0, s2
165-
; GFX940-NEXT: v_mov_b32_e32 v1, s3
164+
; GFX940-NEXT: v_mov_b32_e32 v0, s0
165+
; GFX940-NEXT: v_mov_b32_e32 v1, s1
166166
; GFX940-NEXT: ds_pk_add_f16 v0, v1
167167
; GFX940-NEXT: s_endpgm
168168
%ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
@@ -183,10 +183,10 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
183183
define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr, <2 x i16> %data) {
184184
; GFX940-LABEL: local_atomic_fadd_v2bf16_noret:
185185
; GFX940: ; %bb.0:
186-
; GFX940-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
186+
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
187187
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
188-
; GFX940-NEXT: v_mov_b32_e32 v0, s3
189-
; GFX940-NEXT: v_mov_b32_e32 v1, s2
188+
; GFX940-NEXT: v_mov_b32_e32 v0, s1
189+
; GFX940-NEXT: v_mov_b32_e32 v1, s0
190190
; GFX940-NEXT: buffer_wbl2 sc0 sc1
191191
; GFX940-NEXT: ds_pk_add_bf16 v1, v0
192192
; GFX940-NEXT: s_waitcnt lgkmcnt(0)

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,13 @@ define amdgpu_kernel void @dpp_test(ptr addrspace(1) %out, i32 %in1, i32 %in2) {
1919
;
2020
; GFX10-LABEL: dpp_test:
2121
; GFX10: ; %bb.0:
22-
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
22+
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
2323
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
24-
; GFX10-NEXT: v_mov_b32_e32 v0, s6
25-
; GFX10-NEXT: v_mov_b32_e32 v1, s7
24+
; GFX10-NEXT: v_mov_b32_e32 v0, s2
25+
; GFX10-NEXT: v_mov_b32_e32 v1, s3
2626
; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
2727
; GFX10-NEXT: v_mov_b32_e32 v1, 0
28-
; GFX10-NEXT: global_store_dword v1, v0, s[4:5]
28+
; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
2929
; GFX10-NEXT: s_endpgm
3030
;
3131
; GFX11-LABEL: dpp_test:
@@ -174,16 +174,16 @@ define amdgpu_kernel void @update_dppv2i32_test(ptr addrspace(1) %arg, <2 x i32>
174174
;
175175
; GFX10-LABEL: update_dppv2i32_test:
176176
; GFX10: ; %bb.0:
177-
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
177+
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
178178
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
179179
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
180-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
181-
; GFX10-NEXT: v_mov_b32_e32 v2, s6
182-
; GFX10-NEXT: v_mov_b32_e32 v3, s7
180+
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
181+
; GFX10-NEXT: v_mov_b32_e32 v2, s2
182+
; GFX10-NEXT: v_mov_b32_e32 v3, s3
183183
; GFX10-NEXT: s_waitcnt vmcnt(0)
184184
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
185185
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
186-
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
186+
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
187187
; GFX10-NEXT: s_endpgm
188188
;
189189
; GFX11-LABEL: update_dppv2i32_test:
@@ -229,16 +229,16 @@ define amdgpu_kernel void @update_dppv2f32_test(ptr addrspace(1) %arg, <2 x floa
229229
;
230230
; GFX10-LABEL: update_dppv2f32_test:
231231
; GFX10: ; %bb.0:
232-
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
232+
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
233233
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
234234
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
235-
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[4:5]
236-
; GFX10-NEXT: v_mov_b32_e32 v2, s6
237-
; GFX10-NEXT: v_mov_b32_e32 v3, s7
235+
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[0:1]
236+
; GFX10-NEXT: v_mov_b32_e32 v2, s2
237+
; GFX10-NEXT: v_mov_b32_e32 v3, s3
238238
; GFX10-NEXT: s_waitcnt vmcnt(0)
239239
; GFX10-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
240240
; GFX10-NEXT: v_mov_b32_dpp v3, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
241-
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[4:5]
241+
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1]
242242
; GFX10-NEXT: s_endpgm
243243
;
244244
; GFX11-LABEL: update_dppv2f32_test:

llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll

Lines changed: 76 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -692,148 +692,148 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
692692
;
693693
; GFX9-LABEL: sdivrem_v2i32:
694694
; GFX9: ; %bb.0:
695-
; GFX9-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
695+
; GFX9-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x0
696696
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
697-
; GFX9-NEXT: s_ashr_i32 s0, s14, 31
698-
; GFX9-NEXT: s_add_i32 s1, s14, s0
699-
; GFX9-NEXT: s_xor_b32 s1, s1, s0
700-
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1
701-
; GFX9-NEXT: s_ashr_i32 s2, s15, 31
702-
; GFX9-NEXT: s_add_i32 s3, s15, s2
703-
; GFX9-NEXT: s_xor_b32 s3, s3, s2
697+
; GFX9-NEXT: s_ashr_i32 s8, s6, 31
698+
; GFX9-NEXT: s_add_i32 s6, s6, s8
699+
; GFX9-NEXT: s_xor_b32 s6, s6, s8
700+
; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
701+
; GFX9-NEXT: s_ashr_i32 s9, s7, 31
702+
; GFX9-NEXT: s_add_i32 s7, s7, s9
703+
; GFX9-NEXT: s_xor_b32 s7, s7, s9
704704
; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
705-
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
706-
; GFX9-NEXT: s_sub_i32 s6, 0, s1
707-
; GFX9-NEXT: s_ashr_i32 s4, s12, 31
705+
; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
706+
; GFX9-NEXT: s_sub_i32 s12, 0, s6
707+
; GFX9-NEXT: s_ashr_i32 s10, s4, 31
708708
; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
709709
; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
710710
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
711-
; GFX9-NEXT: s_sub_i32 s7, 0, s3
712-
; GFX9-NEXT: s_ashr_i32 s5, s13, 31
713-
; GFX9-NEXT: v_mul_lo_u32 v2, s6, v0
711+
; GFX9-NEXT: s_add_i32 s4, s4, s10
712+
; GFX9-NEXT: s_xor_b32 s4, s4, s10
713+
; GFX9-NEXT: v_mul_lo_u32 v2, s12, v0
714714
; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
715715
; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
716-
; GFX9-NEXT: s_add_i32 s6, s12, s4
716+
; GFX9-NEXT: s_sub_i32 s12, 0, s7
717717
; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
718-
; GFX9-NEXT: s_xor_b32 s6, s6, s4
719-
; GFX9-NEXT: v_mul_lo_u32 v3, s7, v1
720-
; GFX9-NEXT: s_add_i32 s7, s13, s5
718+
; GFX9-NEXT: s_ashr_i32 s11, s5, 31
719+
; GFX9-NEXT: v_mul_lo_u32 v3, s12, v1
720+
; GFX9-NEXT: s_add_i32 s5, s5, s11
721721
; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
722-
; GFX9-NEXT: v_mul_hi_u32 v0, s6, v0
722+
; GFX9-NEXT: v_mul_hi_u32 v0, s4, v0
723723
; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3
724-
; GFX9-NEXT: s_xor_b32 s7, s7, s5
725-
; GFX9-NEXT: s_xor_b32 s0, s4, s0
726-
; GFX9-NEXT: v_mul_lo_u32 v3, v0, s1
724+
; GFX9-NEXT: s_xor_b32 s5, s5, s11
725+
; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6
727726
; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
728727
; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
729-
; GFX9-NEXT: v_mul_hi_u32 v1, s7, v1
730-
; GFX9-NEXT: v_sub_u32_e32 v3, s6, v3
731-
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v3
728+
; GFX9-NEXT: v_mul_hi_u32 v1, s5, v1
729+
; GFX9-NEXT: v_sub_u32_e32 v3, s4, v3
730+
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v3
732731
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
733-
; GFX9-NEXT: v_subrev_u32_e32 v2, s1, v3
732+
; GFX9-NEXT: v_subrev_u32_e32 v2, s6, v3
734733
; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
735734
; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
736-
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1, v2
735+
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6, v2
737736
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
738-
; GFX9-NEXT: v_subrev_u32_e32 v3, s1, v2
737+
; GFX9-NEXT: v_subrev_u32_e32 v3, s6, v2
739738
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
740-
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s3
739+
; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7
741740
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
742-
; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
743-
; GFX9-NEXT: v_subrev_u32_e32 v0, s0, v0
744-
; GFX9-NEXT: v_sub_u32_e32 v3, s7, v3
745-
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
741+
; GFX9-NEXT: s_xor_b32 s4, s10, s8
742+
; GFX9-NEXT: v_xor_b32_e32 v0, s4, v0
743+
; GFX9-NEXT: v_sub_u32_e32 v3, s5, v3
744+
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
746745
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
747-
; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3
746+
; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3
748747
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
749748
; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
750-
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
749+
; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
750+
; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0
751751
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
752-
; GFX9-NEXT: v_subrev_u32_e32 v4, s3, v3
753-
; GFX9-NEXT: s_xor_b32 s0, s5, s2
752+
; GFX9-NEXT: v_subrev_u32_e32 v4, s7, v3
753+
; GFX9-NEXT: s_xor_b32 s4, s11, s9
754754
; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
755-
; GFX9-NEXT: v_xor_b32_e32 v1, s0, v1
756-
; GFX9-NEXT: v_xor_b32_e32 v2, s4, v2
757-
; GFX9-NEXT: v_subrev_u32_e32 v1, s0, v1
758-
; GFX9-NEXT: v_xor_b32_e32 v3, s5, v3
755+
; GFX9-NEXT: v_xor_b32_e32 v1, s4, v1
756+
; GFX9-NEXT: v_xor_b32_e32 v2, s10, v2
757+
; GFX9-NEXT: v_subrev_u32_e32 v1, s4, v1
758+
; GFX9-NEXT: v_xor_b32_e32 v3, s11, v3
759759
; GFX9-NEXT: v_mov_b32_e32 v4, 0
760-
; GFX9-NEXT: v_subrev_u32_e32 v2, s4, v2
761-
; GFX9-NEXT: v_subrev_u32_e32 v3, s5, v3
762-
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
763-
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
760+
; GFX9-NEXT: v_subrev_u32_e32 v2, s10, v2
761+
; GFX9-NEXT: v_subrev_u32_e32 v3, s11, v3
762+
; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
763+
; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3]
764764
; GFX9-NEXT: s_endpgm
765765
;
766766
; GFX10-LABEL: sdivrem_v2i32:
767767
; GFX10: ; %bb.0:
768-
; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
768+
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0
769769
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
770-
; GFX10-NEXT: s_ashr_i32 s1, s14, 31
771-
; GFX10-NEXT: s_ashr_i32 s2, s15, 31
772-
; GFX10-NEXT: s_add_i32 s0, s14, s1
773-
; GFX10-NEXT: s_add_i32 s3, s15, s2
774-
; GFX10-NEXT: s_xor_b32 s4, s0, s1
770+
; GFX10-NEXT: s_ashr_i32 s1, s10, 31
771+
; GFX10-NEXT: s_ashr_i32 s2, s11, 31
772+
; GFX10-NEXT: s_add_i32 s0, s10, s1
773+
; GFX10-NEXT: s_add_i32 s3, s11, s2
774+
; GFX10-NEXT: s_xor_b32 s10, s0, s1
775775
; GFX10-NEXT: s_xor_b32 s3, s3, s2
776-
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
776+
; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10
777777
; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3
778-
; GFX10-NEXT: s_sub_i32 s0, 0, s4
779-
; GFX10-NEXT: s_sub_i32 s5, 0, s3
780-
; GFX10-NEXT: s_ashr_i32 s6, s13, 31
778+
; GFX10-NEXT: s_sub_i32 s0, 0, s10
779+
; GFX10-NEXT: s_sub_i32 s11, 0, s3
780+
; GFX10-NEXT: s_ashr_i32 s12, s9, 31
781781
; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
782782
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
783-
; GFX10-NEXT: s_add_i32 s7, s13, s6
784-
; GFX10-NEXT: s_xor_b32 s7, s7, s6
785783
; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
786784
; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
787785
; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
788786
; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
789787
; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0
790-
; GFX10-NEXT: v_mul_lo_u32 v3, s5, v1
791-
; GFX10-NEXT: s_ashr_i32 s5, s12, 31
792-
; GFX10-NEXT: s_add_i32 s0, s12, s5
793-
; GFX10-NEXT: s_xor_b32 s1, s5, s1
794-
; GFX10-NEXT: s_xor_b32 s0, s0, s5
788+
; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1
789+
; GFX10-NEXT: s_ashr_i32 s11, s8, 31
790+
; GFX10-NEXT: s_add_i32 s0, s8, s11
791+
; GFX10-NEXT: s_add_i32 s8, s9, s12
792+
; GFX10-NEXT: s_xor_b32 s0, s0, s11
793+
; GFX10-NEXT: s_xor_b32 s8, s8, s12
795794
; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
796795
; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
796+
; GFX10-NEXT: s_xor_b32 s1, s11, s1
797797
; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
798798
; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
799799
; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
800-
; GFX10-NEXT: v_mul_hi_u32 v1, s7, v1
801-
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s4
800+
; GFX10-NEXT: v_mul_hi_u32 v1, s8, v1
801+
; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10
802802
; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3
803803
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
804804
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
805805
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2
806-
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s7, v3
807-
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2
806+
; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8, v3
807+
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
808808
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
809-
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2
809+
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
810810
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
811811
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
812812
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
813813
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
814814
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
815815
; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
816816
; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
817-
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4, v2
817+
; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10, v2
818818
; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
819-
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4, v2
819+
; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10, v2
820820
; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
821821
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
822822
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
823823
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
824824
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
825-
; GFX10-NEXT: s_xor_b32 s0, s6, s2
825+
; GFX10-NEXT: s_xor_b32 s0, s12, s2
826826
; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0
827827
; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1
828-
; GFX10-NEXT: v_xor_b32_e32 v2, s5, v2
829-
; GFX10-NEXT: v_xor_b32_e32 v3, s6, v3
828+
; GFX10-NEXT: v_xor_b32_e32 v2, s11, v2
829+
; GFX10-NEXT: v_xor_b32_e32 v3, s12, v3
830830
; GFX10-NEXT: v_mov_b32_e32 v4, 0
831831
; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0
832832
; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1
833-
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s5, v2
834-
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6, v3
835-
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9]
836-
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11]
833+
; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11, v2
834+
; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12, v3
835+
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
836+
; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7]
837837
; GFX10-NEXT: s_endpgm
838838
%div = sdiv <2 x i32> %x, %y
839839
store <2 x i32> %div, ptr addrspace(1) %out0

0 commit comments

Comments
 (0)