Skip to content

Commit 14943b9

Browse files
author
git apple-llvm automerger
committed
Merge commit '4d4dc1edb9c3' from apple/main into swift/next
2 parents 53506cb + 4d4dc1e commit 14943b9

File tree

4 files changed

+293
-198
lines changed

4 files changed

+293
-198
lines changed

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 22 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -1977,6 +1977,22 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
19771977
return true;
19781978
}
19791979

1980+
// Constrain \p Reg to \p Bank. If \p Reg already has a register bank assigned and it differs from \p Bank, build a cross regbank copy of \p Reg with \p B,
1981+
// assign \p Bank to that copy and return the copy. Otherwise assign \p Bank to \p Reg itself and return \p Reg. NOTE(review): \p Reg is taken by reference but is not reassigned here.
1982+
static Register constrainRegToBank(MachineRegisterInfo &MRI,
1983+
MachineIRBuilder &B, Register &Reg,
1984+
const RegisterBank &Bank) {
1985+
const RegisterBank *CurrBank = MRI.getRegBankOrNull(Reg); // may be null; checked below
1986+
if (CurrBank && *CurrBank != Bank) {
1987+
Register Copy = B.buildCopy(MRI.getType(Reg), Reg).getReg(0); // copy is created with the same type as Reg
1988+
MRI.setRegBank(Copy, Bank); // the original Reg keeps its current bank
1989+
return Copy;
1990+
}
1991+
1992+
MRI.setRegBank(Reg, Bank); // reached when Reg has no bank yet or its bank already equals Bank
1993+
return Reg;
1994+
}
}
1995+
19801996
bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
19811997
MachineInstr &MI, MachineRegisterInfo &MRI,
19821998
const OperandsMapper &OpdMapper) const {
@@ -2040,13 +2056,14 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
20402056
MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
20412057

20422058
for (unsigned L = 0; L < NumLanes; ++L) {
2043-
auto S = B.buildSelect(EltTy, Cmp, InsRegs[L],
2044-
UnmergeToEltTy.getReg(I * NumLanes + L));
2059+
Register Op0 = constrainRegToBank(MRI, B, InsRegs[L], DstBank);
2060+
Register Op1 = UnmergeToEltTy.getReg(I * NumLanes + L);
2061+
Op1 = constrainRegToBank(MRI, B, Op1, DstBank);
20452062

2046-
for (unsigned N : { 0, 2, 3 })
2047-
MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
2063+
Register Select = B.buildSelect(EltTy, Cmp, Op0, Op1).getReg(0);
2064+
MRI.setRegBank(Select, DstBank);
20482065

2049-
Ops[I * NumLanes + L] = S->getOperand(0).getReg();
2066+
Ops[I * NumLanes + L] = Select;
20502067
}
20512068
}
20522069

llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll

Lines changed: 86 additions & 90 deletions
Original file line number | Diff line number | Diff line change
@@ -7,18 +7,18 @@
77
define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 inreg %idx) {
88
; GFX9-LABEL: insertelement_s_v2i8_s_s:
99
; GFX9: ; %bb.0:
10-
; GFX9-NEXT: v_mov_b32_e32 v1, 0
11-
; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
12-
; GFX9-NEXT: v_mov_b32_e32 v0, s4
10+
; GFX9-NEXT: v_mov_b32_e32 v0, 0
11+
; GFX9-NEXT: global_load_ushort v0, v0, s[2:3]
12+
; GFX9-NEXT: v_mov_b32_e32 v1, s4
1313
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
1414
; GFX9-NEXT: s_waitcnt vmcnt(0)
15-
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1
16-
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
15+
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
16+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
1717
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
18-
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
19-
; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
20-
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
21-
; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
18+
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
19+
; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
20+
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
21+
; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
2222
; GFX9-NEXT: v_mov_b32_e32 v0, 0
2323
; GFX9-NEXT: v_mov_b32_e32 v1, 0
2424
; GFX9-NEXT: global_store_short v[0:1], v2, off
@@ -29,13 +29,13 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %pt
2929
; GFX8-NEXT: v_mov_b32_e32 v0, s2
3030
; GFX8-NEXT: v_mov_b32_e32 v1, s3
3131
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
32-
; GFX8-NEXT: v_mov_b32_e32 v2, s4
32+
; GFX8-NEXT: v_mov_b32_e32 v1, s4
3333
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
3434
; GFX8-NEXT: s_waitcnt vmcnt(0)
35-
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0
36-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
35+
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0
36+
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
3737
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
38-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
38+
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
3939
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
4040
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
4141
; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -50,36 +50,35 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %pt
5050
; GFX7-NEXT: s_mov_b32 s1, s3
5151
; GFX7-NEXT: s_mov_b32 s2, -1
5252
; GFX7-NEXT: s_mov_b32 s3, 0xf000
53-
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
54-
; GFX7-NEXT: v_mov_b32_e32 v0, s4
53+
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], 0
54+
; GFX7-NEXT: v_mov_b32_e32 v2, s4
5555
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
56-
; GFX7-NEXT: v_mov_b32_e32 v2, 0xff
56+
; GFX7-NEXT: v_mov_b32_e32 v1, 0xff
5757
; GFX7-NEXT: s_mov_b64 s[0:1], 0
5858
; GFX7-NEXT: s_waitcnt vmcnt(0)
59-
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v1
60-
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
59+
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0
60+
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
6161
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
62-
; GFX7-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
63-
; GFX7-NEXT: v_and_b32_e32 v0, v0, v2
64-
; GFX7-NEXT: v_and_b32_e32 v1, v1, v2
65-
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
66-
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
62+
; GFX7-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
63+
; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
64+
; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
65+
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
66+
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
6767
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
6868
; GFX7-NEXT: s_endpgm
6969
;
7070
; GFX10-LABEL: insertelement_s_v2i8_s_s:
7171
; GFX10: ; %bb.0:
72-
; GFX10-NEXT: v_mov_b32_e32 v1, 0
73-
; GFX10-NEXT: v_mov_b32_e32 v0, s4
74-
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1
75-
; GFX10-NEXT: s_movk_i32 s0, 0xff
76-
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
72+
; GFX10-NEXT: v_mov_b32_e32 v0, 0
73+
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1
74+
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, 0
75+
; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
7776
; GFX10-NEXT: s_waitcnt vmcnt(0)
78-
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
79-
; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
80-
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0
81-
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
82-
; GFX10-NEXT: v_and_b32_sdwa v1, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
77+
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
78+
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s1
79+
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
80+
; GFX10-NEXT: s_movk_i32 s0, 0xff
81+
; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
8382
; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
8483
; GFX10-NEXT: v_mov_b32_e32 v0, 0
8584
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -95,13 +94,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8
9594
; GFX9-LABEL: insertelement_v_v2i8_s_s:
9695
; GFX9: ; %bb.0:
9796
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
98-
; GFX9-NEXT: v_mov_b32_e32 v2, s2
97+
; GFX9-NEXT: v_mov_b32_e32 v1, s2
9998
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
10099
; GFX9-NEXT: s_waitcnt vmcnt(0)
101-
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
102-
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
100+
; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
101+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
103102
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1
104-
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
103+
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
105104
; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
106105
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
107106
; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -113,13 +112,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8
113112
; GFX8-LABEL: insertelement_v_v2i8_s_s:
114113
; GFX8: ; %bb.0:
115114
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
116-
; GFX8-NEXT: v_mov_b32_e32 v2, s2
115+
; GFX8-NEXT: v_mov_b32_e32 v1, s2
117116
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
118117
; GFX8-NEXT: s_waitcnt vmcnt(0)
119-
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0
120-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
118+
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v0
119+
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
121120
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1
122-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
121+
; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
123122
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
124123
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
125124
; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -153,15 +152,14 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8
153152
; GFX10-LABEL: insertelement_v_v2i8_s_s:
154153
; GFX10: ; %bb.0:
155154
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
156-
; GFX10-NEXT: v_mov_b32_e32 v2, s2
157-
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1
158-
; GFX10-NEXT: s_movk_i32 s0, 0xff
155+
; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1
156+
; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 0
159157
; GFX10-NEXT: s_waitcnt vmcnt(0)
160158
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
161-
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
162-
; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 0
159+
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s1
160+
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0
161+
; GFX10-NEXT: s_movk_i32 s0, 0xff
163162
; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
164-
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
165163
; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
166164
; GFX10-NEXT: v_mov_b32_e32 v0, 0
167165
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -259,18 +257,18 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %pt
259257
define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %ptr, i8 inreg %val, i32 %idx) {
260258
; GFX9-LABEL: insertelement_s_v2i8_s_v:
261259
; GFX9: ; %bb.0:
262-
; GFX9-NEXT: v_mov_b32_e32 v2, 0
263-
; GFX9-NEXT: global_load_ushort v2, v2, s[2:3]
264-
; GFX9-NEXT: v_mov_b32_e32 v1, s4
260+
; GFX9-NEXT: v_mov_b32_e32 v1, 0
261+
; GFX9-NEXT: global_load_ushort v1, v1, s[2:3]
262+
; GFX9-NEXT: v_mov_b32_e32 v2, s4
265263
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
266264
; GFX9-NEXT: s_waitcnt vmcnt(0)
267-
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2
268-
; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
265+
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
266+
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
269267
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
270-
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
268+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
271269
; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
272270
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
273-
; GFX9-NEXT: v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
271+
; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
274272
; GFX9-NEXT: v_mov_b32_e32 v0, 0
275273
; GFX9-NEXT: v_mov_b32_e32 v1, 0
276274
; GFX9-NEXT: global_store_short v[0:1], v2, off
@@ -281,13 +279,13 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %pt
281279
; GFX8-NEXT: v_mov_b32_e32 v1, s2
282280
; GFX8-NEXT: v_mov_b32_e32 v2, s3
283281
; GFX8-NEXT: flat_load_ushort v1, v[1:2]
284-
; GFX8-NEXT: v_mov_b32_e32 v3, s4
282+
; GFX8-NEXT: v_mov_b32_e32 v2, s4
285283
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
286284
; GFX8-NEXT: s_waitcnt vmcnt(0)
287-
; GFX8-NEXT: v_lshrrev_b32_e32 v2, 8, v1
288-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
285+
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v1
286+
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
289287
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
290-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
288+
; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
291289
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
292290
; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
293291
; GFX8-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -302,36 +300,35 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %pt
302300
; GFX7-NEXT: s_mov_b32 s1, s3
303301
; GFX7-NEXT: s_mov_b32 s2, -1
304302
; GFX7-NEXT: s_mov_b32 s3, 0xf000
305-
; GFX7-NEXT: buffer_load_ushort v2, off, s[0:3], 0
306-
; GFX7-NEXT: v_mov_b32_e32 v1, s4
303+
; GFX7-NEXT: buffer_load_ushort v1, off, s[0:3], 0
304+
; GFX7-NEXT: v_mov_b32_e32 v3, s4
307305
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
308-
; GFX7-NEXT: v_mov_b32_e32 v3, 0xff
306+
; GFX7-NEXT: v_mov_b32_e32 v2, 0xff
309307
; GFX7-NEXT: s_mov_b64 s[0:1], 0
310308
; GFX7-NEXT: s_waitcnt vmcnt(0)
311-
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v2
312-
; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v1, vcc
309+
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v1
310+
; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
313311
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
314-
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
315-
; GFX7-NEXT: v_and_b32_e32 v0, v0, v3
316-
; GFX7-NEXT: v_and_b32_e32 v1, v2, v3
312+
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc
313+
; GFX7-NEXT: v_and_b32_e32 v0, v0, v2
314+
; GFX7-NEXT: v_and_b32_e32 v1, v1, v2
317315
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
318316
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
319317
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
320318
; GFX7-NEXT: s_endpgm
321319
;
322320
; GFX10-LABEL: insertelement_s_v2i8_s_v:
323321
; GFX10: ; %bb.0:
324-
; GFX10-NEXT: v_mov_b32_e32 v2, 0
325-
; GFX10-NEXT: v_mov_b32_e32 v1, s4
322+
; GFX10-NEXT: v_mov_b32_e32 v1, 0
326323
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
327324
; GFX10-NEXT: s_movk_i32 s0, 0xff
328-
; GFX10-NEXT: global_load_ushort v2, v2, s[2:3]
325+
; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
329326
; GFX10-NEXT: s_waitcnt vmcnt(0)
330-
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2
331-
; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo
327+
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1
328+
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo
332329
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
333-
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo
334-
; GFX10-NEXT: v_and_b32_sdwa v1, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
330+
; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s4, vcc_lo
331+
; GFX10-NEXT: v_and_b32_sdwa v1, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
335332
; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
336333
; GFX10-NEXT: v_mov_b32_e32 v0, 0
337334
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -430,13 +427,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
430427
; GFX9-LABEL: insertelement_v_v2i8_s_v:
431428
; GFX9: ; %bb.0:
432429
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
433-
; GFX9-NEXT: v_mov_b32_e32 v3, s2
430+
; GFX9-NEXT: v_mov_b32_e32 v1, s2
434431
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
435432
; GFX9-NEXT: s_waitcnt vmcnt(0)
436-
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0
437-
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
433+
; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v0
434+
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
438435
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
439-
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
436+
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
440437
; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
441438
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
442439
; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -448,13 +445,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
448445
; GFX8-LABEL: insertelement_v_v2i8_s_v:
449446
; GFX8: ; %bb.0:
450447
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
451-
; GFX8-NEXT: v_mov_b32_e32 v3, s2
448+
; GFX8-NEXT: v_mov_b32_e32 v1, s2
452449
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
453450
; GFX8-NEXT: s_waitcnt vmcnt(0)
454-
; GFX8-NEXT: v_lshrrev_b32_e32 v1, 8, v0
455-
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
451+
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v0
452+
; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
456453
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
457-
; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
454+
; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
458455
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
459456
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
460457
; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -465,14 +462,14 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
465462
;
466463
; GFX7-LABEL: insertelement_v_v2i8_s_v:
467464
; GFX7: ; %bb.0:
465+
; GFX7-NEXT: s_mov_b32 s6, 0
466+
; GFX7-NEXT: s_mov_b32 s7, 0xf000
467+
; GFX7-NEXT: s_mov_b64 s[4:5], 0
468+
; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
468469
; GFX7-NEXT: v_mov_b32_e32 v3, s2
469-
; GFX7-NEXT: s_mov_b32 s2, 0
470-
; GFX7-NEXT: s_mov_b32 s3, 0xf000
471-
; GFX7-NEXT: s_mov_b64 s[0:1], 0
472-
; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
473470
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
474471
; GFX7-NEXT: v_mov_b32_e32 v1, 0xff
475-
; GFX7-NEXT: s_mov_b32 s2, -1
472+
; GFX7-NEXT: s_mov_b32 s6, -1
476473
; GFX7-NEXT: s_waitcnt vmcnt(0)
477474
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0
478475
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
@@ -482,21 +479,20 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
482479
; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
483480
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
484481
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
485-
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
482+
; GFX7-NEXT: buffer_store_short v0, off, s[4:7], 0
486483
; GFX7-NEXT: s_endpgm
487484
;
488485
; GFX10-LABEL: insertelement_v_v2i8_s_v:
489486
; GFX10: ; %bb.0:
490487
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
491-
; GFX10-NEXT: v_mov_b32_e32 v3, s2
492488
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
493489
; GFX10-NEXT: s_movk_i32 s0, 0xff
494490
; GFX10-NEXT: s_waitcnt vmcnt(0)
495491
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
496-
; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
492+
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo
497493
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
498494
; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
499-
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
495+
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo
500496
; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
501497
; GFX10-NEXT: v_mov_b32_e32 v0, 0
502498
; GFX10-NEXT: v_mov_b32_e32 v1, 0

0 commit comments

Comments
 (0)