7
7
define amdgpu_ps void @insertelement_s_v2i8_s_s (<2 x i8 > addrspace (4 )* inreg %ptr , i8 inreg %val , i32 inreg %idx ) {
8
8
; GFX9-LABEL: insertelement_s_v2i8_s_s:
9
9
; GFX9: ; %bb.0:
10
- ; GFX9-NEXT: v_mov_b32_e32 v1 , 0
11
- ; GFX9-NEXT: global_load_ushort v1, v1 , s[2:3]
12
- ; GFX9-NEXT: v_mov_b32_e32 v0 , s4
10
+ ; GFX9-NEXT: v_mov_b32_e32 v0 , 0
11
+ ; GFX9-NEXT: global_load_ushort v0, v0 , s[2:3]
12
+ ; GFX9-NEXT: v_mov_b32_e32 v1 , s4
13
13
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
14
14
; GFX9-NEXT: s_waitcnt vmcnt(0)
15
- ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v1
16
- ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v0 , vcc
15
+ ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 8, v0
16
+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1 , vcc
17
17
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
18
- ; GFX9-NEXT: v_cndmask_b32_e32 v0 , v2, v0 , vcc
19
- ; GFX9-NEXT: v_and_b32_e32 v0 , 0xff, v0
20
- ; GFX9-NEXT: v_lshlrev_b16_e32 v0 , 8, v0
21
- ; GFX9-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
18
+ ; GFX9-NEXT: v_cndmask_b32_e32 v1 , v2, v1 , vcc
19
+ ; GFX9-NEXT: v_and_b32_e32 v1 , 0xff, v1
20
+ ; GFX9-NEXT: v_lshlrev_b16_e32 v1 , 8, v1
21
+ ; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
22
22
; GFX9-NEXT: v_mov_b32_e32 v0, 0
23
23
; GFX9-NEXT: v_mov_b32_e32 v1, 0
24
24
; GFX9-NEXT: global_store_short v[0:1], v2, off
@@ -29,13 +29,13 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %pt
29
29
; GFX8-NEXT: v_mov_b32_e32 v0, s2
30
30
; GFX8-NEXT: v_mov_b32_e32 v1, s3
31
31
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
32
- ; GFX8-NEXT: v_mov_b32_e32 v2 , s4
32
+ ; GFX8-NEXT: v_mov_b32_e32 v1 , s4
33
33
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
34
34
; GFX8-NEXT: s_waitcnt vmcnt(0)
35
- ; GFX8-NEXT: v_lshrrev_b32_e32 v1 , 8, v0
36
- ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2 , vcc
35
+ ; GFX8-NEXT: v_lshrrev_b32_e32 v2 , 8, v0
36
+ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1 , vcc
37
37
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
38
- ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2 , vcc
38
+ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1 , vcc
39
39
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
40
40
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
41
41
; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -50,36 +50,35 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(<2 x i8> addrspace(4)* inreg %pt
50
50
; GFX7-NEXT: s_mov_b32 s1, s3
51
51
; GFX7-NEXT: s_mov_b32 s2, -1
52
52
; GFX7-NEXT: s_mov_b32 s3, 0xf000
53
- ; GFX7-NEXT: buffer_load_ushort v1 , off, s[0:3], 0
54
- ; GFX7-NEXT: v_mov_b32_e32 v0 , s4
53
+ ; GFX7-NEXT: buffer_load_ushort v0 , off, s[0:3], 0
54
+ ; GFX7-NEXT: v_mov_b32_e32 v2 , s4
55
55
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 0
56
- ; GFX7-NEXT: v_mov_b32_e32 v2 , 0xff
56
+ ; GFX7-NEXT: v_mov_b32_e32 v1 , 0xff
57
57
; GFX7-NEXT: s_mov_b64 s[0:1], 0
58
58
; GFX7-NEXT: s_waitcnt vmcnt(0)
59
- ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v1
60
- ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v0 , vcc
59
+ ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 8, v0
60
+ ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v2 , vcc
61
61
; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s5, 1
62
- ; GFX7-NEXT: v_cndmask_b32_e32 v0 , v3, v0 , vcc
63
- ; GFX7-NEXT: v_and_b32_e32 v0, v0, v2
64
- ; GFX7-NEXT: v_and_b32_e32 v1, v1, v2
65
- ; GFX7-NEXT: v_lshlrev_b32_e32 v0 , 8, v0
66
- ; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
62
+ ; GFX7-NEXT: v_cndmask_b32_e32 v2 , v3, v2 , vcc
63
+ ; GFX7-NEXT: v_and_b32_e32 v0, v0, v1
64
+ ; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
65
+ ; GFX7-NEXT: v_lshlrev_b32_e32 v1 , 8, v1
66
+ ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
67
67
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
68
68
; GFX7-NEXT: s_endpgm
69
69
;
70
70
; GFX10-LABEL: insertelement_s_v2i8_s_s:
71
71
; GFX10: ; %bb.0:
72
- ; GFX10-NEXT: v_mov_b32_e32 v1, 0
73
- ; GFX10-NEXT: v_mov_b32_e32 v0, s4
74
- ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1
75
- ; GFX10-NEXT: s_movk_i32 s0, 0xff
76
- ; GFX10-NEXT: global_load_ushort v1, v1, s[2:3]
72
+ ; GFX10-NEXT: v_mov_b32_e32 v0, 0
73
+ ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s5, 1
74
+ ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s5, 0
75
+ ; GFX10-NEXT: global_load_ushort v0, v0, s[2:3]
77
76
; GFX10-NEXT: s_waitcnt vmcnt(0)
78
- ; GFX10-NEXT: v_lshrrev_b32_e32 v2 , 8, v1
79
- ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo
80
- ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0
81
- ; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
82
- ; GFX10-NEXT: v_and_b32_sdwa v1, v2 , s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
77
+ ; GFX10-NEXT: v_lshrrev_b32_e32 v1 , 8, v0
78
+ ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s1
79
+ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0
80
+ ; GFX10-NEXT: s_movk_i32 s0, 0xff
81
+ ; GFX10-NEXT: v_and_b32_sdwa v1, v1 , s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
83
82
; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
84
83
; GFX10-NEXT: v_mov_b32_e32 v0, 0
85
84
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -95,13 +94,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8
95
94
; GFX9-LABEL: insertelement_v_v2i8_s_s:
96
95
; GFX9: ; %bb.0:
97
96
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
98
- ; GFX9-NEXT: v_mov_b32_e32 v2 , s2
97
+ ; GFX9-NEXT: v_mov_b32_e32 v1 , s2
99
98
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
100
99
; GFX9-NEXT: s_waitcnt vmcnt(0)
101
- ; GFX9-NEXT: v_lshrrev_b32_e32 v1 , 8, v0
102
- ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2 , vcc
100
+ ; GFX9-NEXT: v_lshrrev_b32_e32 v2 , 8, v0
101
+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1 , vcc
103
102
; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1
104
- ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2 , vcc
103
+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1 , vcc
105
104
; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
106
105
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
107
106
; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -113,13 +112,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8
113
112
; GFX8-LABEL: insertelement_v_v2i8_s_s:
114
113
; GFX8: ; %bb.0:
115
114
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
116
- ; GFX8-NEXT: v_mov_b32_e32 v2 , s2
115
+ ; GFX8-NEXT: v_mov_b32_e32 v1 , s2
117
116
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 0
118
117
; GFX8-NEXT: s_waitcnt vmcnt(0)
119
- ; GFX8-NEXT: v_lshrrev_b32_e32 v1 , 8, v0
120
- ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2 , vcc
118
+ ; GFX8-NEXT: v_lshrrev_b32_e32 v2 , 8, v0
119
+ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1 , vcc
121
120
; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s3, 1
122
- ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2 , vcc
121
+ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1 , vcc
123
122
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
124
123
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
125
124
; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -153,15 +152,14 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(<2 x i8> addrspace(1)* %ptr, i8
153
152
; GFX10-LABEL: insertelement_v_v2i8_s_s:
154
153
; GFX10: ; %bb.0:
155
154
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
156
- ; GFX10-NEXT: v_mov_b32_e32 v2, s2
157
- ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 1
158
- ; GFX10-NEXT: s_movk_i32 s0, 0xff
155
+ ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s3, 1
156
+ ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s3, 0
159
157
; GFX10-NEXT: s_waitcnt vmcnt(0)
160
158
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
161
- ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc_lo
162
- ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s3, 0
159
+ ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s1
160
+ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0
161
+ ; GFX10-NEXT: s_movk_i32 s0, 0xff
163
162
; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
164
- ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc_lo
165
163
; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
166
164
; GFX10-NEXT: v_mov_b32_e32 v0, 0
167
165
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -259,18 +257,18 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(<2 x i8> addrspace(4)* inreg %pt
259
257
define amdgpu_ps void @insertelement_s_v2i8_s_v (<2 x i8 > addrspace (4 )* inreg %ptr , i8 inreg %val , i32 %idx ) {
260
258
; GFX9-LABEL: insertelement_s_v2i8_s_v:
261
259
; GFX9: ; %bb.0:
262
- ; GFX9-NEXT: v_mov_b32_e32 v2 , 0
263
- ; GFX9-NEXT: global_load_ushort v2, v2 , s[2:3]
264
- ; GFX9-NEXT: v_mov_b32_e32 v1 , s4
260
+ ; GFX9-NEXT: v_mov_b32_e32 v1 , 0
261
+ ; GFX9-NEXT: global_load_ushort v1, v1 , s[2:3]
262
+ ; GFX9-NEXT: v_mov_b32_e32 v2 , s4
265
263
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
266
264
; GFX9-NEXT: s_waitcnt vmcnt(0)
267
- ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v2
268
- ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v1 , vcc
265
+ ; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v1
266
+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v2 , vcc
269
267
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
270
- ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v1 , vcc
268
+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v3, v2 , vcc
271
269
; GFX9-NEXT: v_and_b32_e32 v0, 0xff, v0
272
270
; GFX9-NEXT: v_lshlrev_b16_e32 v0, 8, v0
273
- ; GFX9-NEXT: v_or_b32_sdwa v2, v2 , v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
271
+ ; GFX9-NEXT: v_or_b32_sdwa v2, v1 , v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
274
272
; GFX9-NEXT: v_mov_b32_e32 v0, 0
275
273
; GFX9-NEXT: v_mov_b32_e32 v1, 0
276
274
; GFX9-NEXT: global_store_short v[0:1], v2, off
@@ -281,13 +279,13 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %pt
281
279
; GFX8-NEXT: v_mov_b32_e32 v1, s2
282
280
; GFX8-NEXT: v_mov_b32_e32 v2, s3
283
281
; GFX8-NEXT: flat_load_ushort v1, v[1:2]
284
- ; GFX8-NEXT: v_mov_b32_e32 v3 , s4
282
+ ; GFX8-NEXT: v_mov_b32_e32 v2 , s4
285
283
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
286
284
; GFX8-NEXT: s_waitcnt vmcnt(0)
287
- ; GFX8-NEXT: v_lshrrev_b32_e32 v2 , 8, v1
288
- ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3 , vcc
285
+ ; GFX8-NEXT: v_lshrrev_b32_e32 v3 , 8, v1
286
+ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v2 , vcc
289
287
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
290
- ; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3 , vcc
288
+ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v2 , vcc
291
289
; GFX8-NEXT: v_and_b32_e32 v0, 0xff, v0
292
290
; GFX8-NEXT: v_lshlrev_b16_e32 v0, 8, v0
293
291
; GFX8-NEXT: v_or_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -302,36 +300,35 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(<2 x i8> addrspace(4)* inreg %pt
302
300
; GFX7-NEXT: s_mov_b32 s1, s3
303
301
; GFX7-NEXT: s_mov_b32 s2, -1
304
302
; GFX7-NEXT: s_mov_b32 s3, 0xf000
305
- ; GFX7-NEXT: buffer_load_ushort v2 , off, s[0:3], 0
306
- ; GFX7-NEXT: v_mov_b32_e32 v1 , s4
303
+ ; GFX7-NEXT: buffer_load_ushort v1 , off, s[0:3], 0
304
+ ; GFX7-NEXT: v_mov_b32_e32 v3 , s4
307
305
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
308
- ; GFX7-NEXT: v_mov_b32_e32 v3 , 0xff
306
+ ; GFX7-NEXT: v_mov_b32_e32 v2 , 0xff
309
307
; GFX7-NEXT: s_mov_b64 s[0:1], 0
310
308
; GFX7-NEXT: s_waitcnt vmcnt(0)
311
- ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v2
312
- ; GFX7-NEXT: v_cndmask_b32_e32 v2, v2, v1 , vcc
309
+ ; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v1
310
+ ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v3 , vcc
313
311
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
314
- ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v1 , vcc
315
- ; GFX7-NEXT: v_and_b32_e32 v0, v0, v3
316
- ; GFX7-NEXT: v_and_b32_e32 v1, v2, v3
312
+ ; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v3 , vcc
313
+ ; GFX7-NEXT: v_and_b32_e32 v0, v0, v2
314
+ ; GFX7-NEXT: v_and_b32_e32 v1, v1, v2
317
315
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 8, v0
318
316
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
319
317
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], 0
320
318
; GFX7-NEXT: s_endpgm
321
319
;
322
320
; GFX10-LABEL: insertelement_s_v2i8_s_v:
323
321
; GFX10: ; %bb.0:
324
- ; GFX10-NEXT: v_mov_b32_e32 v2, 0
325
- ; GFX10-NEXT: v_mov_b32_e32 v1, s4
322
+ ; GFX10-NEXT: v_mov_b32_e32 v1, 0
326
323
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
327
324
; GFX10-NEXT: s_movk_i32 s0, 0xff
328
- ; GFX10-NEXT: global_load_ushort v2, v2 , s[2:3]
325
+ ; GFX10-NEXT: global_load_ushort v1, v1 , s[2:3]
329
326
; GFX10-NEXT: s_waitcnt vmcnt(0)
330
- ; GFX10-NEXT: v_lshrrev_b32_e32 v3 , 8, v2
331
- ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v1 , vcc_lo
327
+ ; GFX10-NEXT: v_lshrrev_b32_e32 v2 , 8, v1
328
+ ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4 , vcc_lo
332
329
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
333
- ; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v1 , vcc_lo
334
- ; GFX10-NEXT: v_and_b32_sdwa v1, v3 , s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
330
+ ; GFX10-NEXT: v_cndmask_b32_e64 v0, v1, s4 , vcc_lo
331
+ ; GFX10-NEXT: v_and_b32_sdwa v1, v2 , s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
335
332
; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
336
333
; GFX10-NEXT: v_mov_b32_e32 v0, 0
337
334
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -430,13 +427,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
430
427
; GFX9-LABEL: insertelement_v_v2i8_s_v:
431
428
; GFX9: ; %bb.0:
432
429
; GFX9-NEXT: global_load_ushort v0, v[0:1], off
433
- ; GFX9-NEXT: v_mov_b32_e32 v3 , s2
430
+ ; GFX9-NEXT: v_mov_b32_e32 v1 , s2
434
431
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
435
432
; GFX9-NEXT: s_waitcnt vmcnt(0)
436
- ; GFX9-NEXT: v_lshrrev_b32_e32 v1 , 8, v0
437
- ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3 , vcc
433
+ ; GFX9-NEXT: v_lshrrev_b32_e32 v3 , 8, v0
434
+ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v1 , vcc
438
435
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
439
- ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3 , vcc
436
+ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1 , vcc
440
437
; GFX9-NEXT: v_and_b32_e32 v1, 0xff, v1
441
438
; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1
442
439
; GFX9-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -448,13 +445,13 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
448
445
; GFX8-LABEL: insertelement_v_v2i8_s_v:
449
446
; GFX8: ; %bb.0:
450
447
; GFX8-NEXT: flat_load_ushort v0, v[0:1]
451
- ; GFX8-NEXT: v_mov_b32_e32 v3 , s2
448
+ ; GFX8-NEXT: v_mov_b32_e32 v1 , s2
452
449
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
453
450
; GFX8-NEXT: s_waitcnt vmcnt(0)
454
- ; GFX8-NEXT: v_lshrrev_b32_e32 v1 , 8, v0
455
- ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v3 , vcc
451
+ ; GFX8-NEXT: v_lshrrev_b32_e32 v3 , 8, v0
452
+ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1 , vcc
456
453
; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
457
- ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v3 , vcc
454
+ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1 , vcc
458
455
; GFX8-NEXT: v_and_b32_e32 v1, 0xff, v1
459
456
; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1
460
457
; GFX8-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -465,14 +462,14 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
465
462
;
466
463
; GFX7-LABEL: insertelement_v_v2i8_s_v:
467
464
; GFX7: ; %bb.0:
465
+ ; GFX7-NEXT: s_mov_b32 s6, 0
466
+ ; GFX7-NEXT: s_mov_b32 s7, 0xf000
467
+ ; GFX7-NEXT: s_mov_b64 s[4:5], 0
468
+ ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
468
469
; GFX7-NEXT: v_mov_b32_e32 v3, s2
469
- ; GFX7-NEXT: s_mov_b32 s2, 0
470
- ; GFX7-NEXT: s_mov_b32 s3, 0xf000
471
- ; GFX7-NEXT: s_mov_b64 s[0:1], 0
472
- ; GFX7-NEXT: buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
473
470
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2
474
471
; GFX7-NEXT: v_mov_b32_e32 v1, 0xff
475
- ; GFX7-NEXT: s_mov_b32 s2 , -1
472
+ ; GFX7-NEXT: s_mov_b32 s6 , -1
476
473
; GFX7-NEXT: s_waitcnt vmcnt(0)
477
474
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 8, v0
478
475
; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
@@ -482,21 +479,20 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(<2 x i8> addrspace(1)* %ptr, i8
482
479
; GFX7-NEXT: v_and_b32_e32 v1, v2, v1
483
480
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 8, v1
484
481
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
485
- ; GFX7-NEXT: buffer_store_short v0, off, s[0:3 ], 0
482
+ ; GFX7-NEXT: buffer_store_short v0, off, s[4:7 ], 0
486
483
; GFX7-NEXT: s_endpgm
487
484
;
488
485
; GFX10-LABEL: insertelement_v_v2i8_s_v:
489
486
; GFX10: ; %bb.0:
490
487
; GFX10-NEXT: global_load_ushort v0, v[0:1], off
491
- ; GFX10-NEXT: v_mov_b32_e32 v3, s2
492
488
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2
493
489
; GFX10-NEXT: s_movk_i32 s0, 0xff
494
490
; GFX10-NEXT: s_waitcnt vmcnt(0)
495
491
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0
496
- ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v3 , vcc_lo
492
+ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2 , vcc_lo
497
493
; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2
498
494
; GFX10-NEXT: v_and_b32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
499
- ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3 , vcc_lo
495
+ ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2 , vcc_lo
500
496
; GFX10-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
501
497
; GFX10-NEXT: v_mov_b32_e32 v0, 0
502
498
; GFX10-NEXT: v_mov_b32_e32 v1, 0
0 commit comments