@@ -159,93 +159,85 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
159
159
; CI: ; %bb.0:
160
160
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
161
161
; CI-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
162
- ; CI-NEXT: v_ashrrev_i32_e32 v11 , 31, v0
162
+ ; CI-NEXT: v_ashrrev_i32_e32 v13 , 31, v0
163
163
; CI-NEXT: v_mov_b32_e32 v8, 0
164
- ; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v11 , v1, v[7:8]
165
- ; CI-NEXT: v_ashrrev_i32_e32 v12 , 31, v1
166
- ; CI-NEXT: v_and_b32_e32 v14, v11 , v1
167
- ; CI-NEXT: v_mov_b32_e32 v1 , v10
164
+ ; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13 , v1, v[7:8]
165
+ ; CI-NEXT: v_ashrrev_i32_e32 v14 , 31, v1
166
+ ; CI-NEXT: v_mad_i64_i32 v[11:12], s[4:5] , v1, v13, 0
167
+ ; CI-NEXT: v_mov_b32_e32 v7 , v10
168
168
; CI-NEXT: v_mov_b32_e32 v10, v8
169
- ; CI-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v0, v12, v[9:10]
170
- ; CI-NEXT: v_and_b32_e32 v13, v11, v12
171
- ; CI-NEXT: v_sub_i32_e32 v9, vcc, 0, v14
172
- ; CI-NEXT: v_subb_u32_e32 v10, vcc, 0, v13, vcc
173
- ; CI-NEXT: v_mad_i64_i32 v[9:10], s[4:5], v12, v0, v[9:10]
174
- ; CI-NEXT: v_mov_b32_e32 v0, v8
175
- ; CI-NEXT: v_add_i32_e32 v0, vcc, v1, v0
176
- ; CI-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
177
- ; CI-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v11, v12, v[0:1]
178
- ; CI-NEXT: v_add_i32_e32 v8, vcc, v0, v9
179
- ; CI-NEXT: v_addc_u32_e32 v9, vcc, v1, v10, vcc
180
- ; CI-NEXT: v_mov_b32_e32 v1, v7
169
+ ; CI-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v14, v[9:10]
170
+ ; CI-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[11:12]
171
+ ; CI-NEXT: v_add_i32_e32 v9, vcc, v7, v9
172
+ ; CI-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, vcc
173
+ ; CI-NEXT: v_mad_u64_u32 v[9:10], s[4:5], v13, v14, v[9:10]
174
+ ; CI-NEXT: v_add_i32_e32 v7, vcc, v9, v0
175
+ ; CI-NEXT: v_addc_u32_e32 v9, vcc, v10, v1, vcc
176
+ ; CI-NEXT: v_mov_b32_e32 v1, v8
181
177
; CI-NEXT: v_add_i32_e32 v0, vcc, v6, v2
182
178
; CI-NEXT: v_addc_u32_e32 v1, vcc, v1, v3, vcc
183
- ; CI-NEXT: v_addc_u32_e32 v2, vcc, v8 , v4, vcc
179
+ ; CI-NEXT: v_addc_u32_e32 v2, vcc, v7 , v4, vcc
184
180
; CI-NEXT: v_addc_u32_e32 v3, vcc, v9, v5, vcc
185
181
; CI-NEXT: s_setpc_b64 s[30:31]
186
182
;
187
183
; SI-LABEL: mad_i64_i32_sextops_i32_i128:
188
184
; SI: ; %bb.0:
189
185
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
190
186
; SI-NEXT: v_ashrrev_i32_e32 v6, 31, v0
187
+ ; SI-NEXT: v_mul_lo_u32 v11, v6, v1
188
+ ; SI-NEXT: v_mul_hi_u32 v12, v0, v1
191
189
; SI-NEXT: v_ashrrev_i32_e32 v7, 31, v1
192
- ; SI-NEXT: v_and_b32_e32 v9, v6, v1
193
- ; SI-NEXT: v_and_b32_e32 v10, v7, v0
194
- ; SI-NEXT: v_mul_lo_u32 v13, v6, v1
195
- ; SI-NEXT: v_mul_hi_u32 v14, v0, v1
196
- ; SI-NEXT: v_and_b32_e32 v8, v6, v7
197
- ; SI-NEXT: v_add_i32_e32 v9, vcc, v10, v9
198
- ; SI-NEXT: v_mul_hi_u32 v10, v6, v7
199
- ; SI-NEXT: v_mul_i32_i24_e32 v11, v6, v7
200
- ; SI-NEXT: v_mul_hi_u32 v6, v6, v1
201
- ; SI-NEXT: v_mul_hi_u32 v12, v0, v7
202
- ; SI-NEXT: v_mul_lo_u32 v7, v0, v7
203
- ; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v8, vcc
204
- ; SI-NEXT: v_add_i32_e32 v13, vcc, v13, v14
205
- ; SI-NEXT: v_addc_u32_e32 v6, vcc, 0, v6, vcc
206
- ; SI-NEXT: v_add_i32_e32 v7, vcc, v7, v13
207
- ; SI-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc
208
- ; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v12
209
- ; SI-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, vcc
210
- ; SI-NEXT: v_add_i32_e32 v6, vcc, v11, v6
190
+ ; SI-NEXT: v_mul_hi_u32 v14, v6, v1
191
+ ; SI-NEXT: v_mul_lo_u32 v13, v0, v7
192
+ ; SI-NEXT: v_mul_hi_u32 v10, v0, v7
193
+ ; SI-NEXT: v_add_i32_e32 v12, vcc, v11, v12
194
+ ; SI-NEXT: v_addc_u32_e32 v14, vcc, 0, v14, vcc
195
+ ; SI-NEXT: v_mul_hi_u32 v8, v6, v7
196
+ ; SI-NEXT: v_add_i32_e32 v12, vcc, v13, v12
197
+ ; SI-NEXT: v_addc_u32_e32 v10, vcc, 0, v10, vcc
198
+ ; SI-NEXT: v_mul_i32_i24_e32 v9, v6, v7
199
+ ; SI-NEXT: v_add_i32_e32 v10, vcc, v14, v10
200
+ ; SI-NEXT: v_mul_hi_i32 v6, v1, v6
201
+ ; SI-NEXT: v_mul_hi_i32 v7, v7, v0
202
+ ; SI-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, vcc
203
+ ; SI-NEXT: v_add_i32_e32 v9, vcc, v9, v10
204
+ ; SI-NEXT: v_addc_u32_e32 v8, vcc, v8, v14, vcc
205
+ ; SI-NEXT: v_add_i32_e32 v10, vcc, v13, v11
211
206
; SI-NEXT: v_mul_lo_u32 v0, v0, v1
212
- ; SI-NEXT: v_addc_u32_e32 v10 , vcc, v10, v12 , vcc
213
- ; SI-NEXT: v_sub_i32_e32 v6 , vcc, v6, v9
214
- ; SI-NEXT: v_subb_u32_e32 v8 , vcc, v10, v8 , vcc
207
+ ; SI-NEXT: v_addc_u32_e32 v6 , vcc, v7, v6 , vcc
208
+ ; SI-NEXT: v_add_i32_e32 v7 , vcc, v9, v10
209
+ ; SI-NEXT: v_addc_u32_e32 v6 , vcc, v8, v6 , vcc
215
210
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
216
- ; SI-NEXT: v_addc_u32_e32 v1, vcc, v7 , v3, vcc
217
- ; SI-NEXT: v_addc_u32_e32 v2, vcc, v6 , v4, vcc
218
- ; SI-NEXT: v_addc_u32_e32 v3, vcc, v8 , v5, vcc
211
+ ; SI-NEXT: v_addc_u32_e32 v1, vcc, v12 , v3, vcc
212
+ ; SI-NEXT: v_addc_u32_e32 v2, vcc, v7 , v4, vcc
213
+ ; SI-NEXT: v_addc_u32_e32 v3, vcc, v6 , v5, vcc
219
214
; SI-NEXT: s_setpc_b64 s[30:31]
220
215
;
221
216
; GFX9-LABEL: mad_i64_i32_sextops_i32_i128:
222
217
; GFX9: ; %bb.0:
223
218
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
224
- ; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v0
225
- ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v0, v1, 0
226
- ; GFX9-NEXT: v_ashrrev_i32_e32 v15, 31, v1
227
- ; GFX9-NEXT: v_and_b32_e32 v6, v14, v1
228
- ; GFX9-NEXT: v_mov_b32_e32 v11, 0
229
- ; GFX9-NEXT: v_mov_b32_e32 v10, v9
230
- ; GFX9-NEXT: v_and_b32_e32 v7, v14, v15
231
- ; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, 0, v6
232
- ; GFX9-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v14, v1, v[10:11]
233
- ; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, 0, v7, vcc
234
- ; GFX9-NEXT: v_mov_b32_e32 v10, v13
235
- ; GFX9-NEXT: v_mov_b32_e32 v13, v11
236
- ; GFX9-NEXT: v_mad_i64_i32 v[6:7], s[4:5], v15, v0, v[6:7]
237
- ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v0, v15, v[12:13]
238
- ; GFX9-NEXT: v_mov_b32_e32 v12, v1
239
- ; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, v10, v12
240
- ; GFX9-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, 0, vcc
241
- ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v14, v15, v[10:11]
242
- ; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v10, v6
243
- ; GFX9-NEXT: v_addc_co_u32_e32 v7, vcc, v11, v7, vcc
244
- ; GFX9-NEXT: v_mov_b32_e32 v1, v0
245
- ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v8, v2
219
+ ; GFX9-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v0, v1, 0
220
+ ; GFX9-NEXT: v_ashrrev_i32_e32 v13, 31, v0
221
+ ; GFX9-NEXT: v_mov_b32_e32 v9, 0
222
+ ; GFX9-NEXT: v_mov_b32_e32 v8, v7
223
+ ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v13, v1, v[8:9]
224
+ ; GFX9-NEXT: v_ashrrev_i32_e32 v14, 31, v1
225
+ ; GFX9-NEXT: v_mov_b32_e32 v8, v11
226
+ ; GFX9-NEXT: v_mov_b32_e32 v11, v9
227
+ ; GFX9-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v0, v14, v[10:11]
228
+ ; GFX9-NEXT: v_mov_b32_e32 v12, v11
229
+ ; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v8, v12
230
+ ; GFX9-NEXT: v_addc_co_u32_e64 v9, s[4:5], 0, 0, vcc
231
+ ; GFX9-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v13, v14, v[8:9]
232
+ ; GFX9-NEXT: v_mad_i64_i32 v[12:13], s[4:5], v1, v13, 0
233
+ ; GFX9-NEXT: v_mad_i64_i32 v[0:1], s[4:5], v14, v0, v[12:13]
234
+ ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, v8, v0
235
+ ; GFX9-NEXT: v_addc_co_u32_e32 v8, vcc, v9, v1, vcc
236
+ ; GFX9-NEXT: v_mov_b32_e32 v1, v10
237
+ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v6, v2
246
238
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v3, vcc
247
- ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v6 , v4, vcc
248
- ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v7 , v5, vcc
239
+ ; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, v7 , v4, vcc
240
+ ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v8 , v5, vcc
249
241
; GFX9-NEXT: s_setpc_b64 s[30:31]
250
242
;
251
243
; GFX11-LABEL: mad_i64_i32_sextops_i32_i128:
@@ -254,30 +246,27 @@ define i128 @mad_i64_i32_sextops_i32_i128(i32 %arg0, i32 %arg1, i128 %arg2) #0 {
254
246
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
255
247
; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v1, 0
256
248
; GFX11-NEXT: v_mov_b32_e32 v8, 0
257
- ; GFX11-NEXT: v_ashrrev_i32_e32 v16 , 31, v0
258
- ; GFX11-NEXT: v_ashrrev_i32_e32 v17 , 31, v1
249
+ ; GFX11-NEXT: v_ashrrev_i32_e32 v14 , 31, v0
250
+ ; GFX11-NEXT: v_ashrrev_i32_e32 v15 , 31, v1
259
251
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
260
- ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v16 , v1, v[7:8]
252
+ ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14 , v1, v[7:8]
261
253
; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8
262
- ; GFX11-NEXT: v_and_b32_e32 v8, v16, v1
263
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
264
- ; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v17, v[9:10]
265
- ; GFX11-NEXT: v_and_b32_e32 v9, v16, v17
266
- ; GFX11-NEXT: v_sub_co_u32 v8, vcc_lo, 0, v8
267
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
268
- ; GFX11-NEXT: v_sub_co_ci_u32_e32 v9, vcc_lo, 0, v9, vcc_lo
269
- ; GFX11-NEXT: v_mov_b32_e32 v1, v12
254
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
255
+ ; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10]
256
+ ; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0
257
+ ; GFX11-NEXT: v_mov_b32_e32 v8, v12
270
258
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
271
- ; GFX11-NEXT: v_mad_i64_i32 v[14:15], null, v17, v0, v[8:9]
272
- ; GFX11-NEXT: v_add_co_u32 v12, s0, v7, v1
273
- ; GFX11-NEXT: v_mov_b32_e32 v7, v11
274
- ; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, 0, 0, s0
259
+ ; GFX11-NEXT: v_mad_i64_i32 v[12:13], null, v15, v0, v[9:10]
260
+ ; GFX11-NEXT: v_add_co_u32 v7, s0, v7, v8
275
261
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
276
- ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v16, v17, v[12:13]
277
- ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v14
278
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
279
- ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v15, vcc_lo
262
+ ; GFX11-NEXT: v_add_co_ci_u32_e64 v8, null, 0, 0, s0
263
+ ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v14, v15, v[7:8]
264
+ ; GFX11-NEXT: v_mov_b32_e32 v7, v11
265
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
266
+ ; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v0, v12
267
+ ; GFX11-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, v1, v13, vcc_lo
280
268
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v6, v2
269
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
281
270
; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, v7, v3, vcc_lo
282
271
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, v8, v4, vcc_lo
283
272
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
0 commit comments