@@ -94,8 +94,8 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; SSSE3-SLOW-LABEL: pair_sum_v4i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
- ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
- ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3]
+ ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
+ ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm3
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
@@ -115,9 +115,9 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX1-SLOW-LABEL: pair_sum_v4i32_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
- ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
- ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,3]
- ; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+ ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
+ ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+ ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
@@ -184,57 +184,55 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,1,3]
+ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
+ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
- ; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
- ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm1
- ; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3
+ ; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1
+ ; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[2,0]
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm5[3,1]
- ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm3
- ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+ ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2
+ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,0]
+ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[3,1]
+ ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
+ ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6
; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm6[0,3]
- ; SSSE3-SLOW-NEXT: movaps %xmm3, %xmm1
+ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm6[0,3]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
- ; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
- ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
- ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm3
+ ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
+ ; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
; SSSE3-FAST-NEXT: haddps %xmm4, %xmm5
- ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[2,0]
- ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm5[3,1]
- ; SSSE3-FAST-NEXT: addps %xmm1, %xmm3
- ; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm3[0]
+ ; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
+ ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[2,0]
+ ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1],xmm5[3,1]
+ ; SSSE3-FAST-NEXT: addps %xmm2, %xmm1
+ ; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-FAST-NEXT: haddps %xmm6, %xmm6
; SSSE3-FAST-NEXT: haddps %xmm7, %xmm7
; SSSE3-FAST-NEXT: haddps %xmm7, %xmm6
- ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,3],xmm6[0,2]
- ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
+ ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm6[0,2]
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
- ; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
- ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,1]
- ; AVX1-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm0
- ; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm1
- ; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm2
- ; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm3
- ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm4 = xmm1[0,2],xmm2[0,1]
- ; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm3[0]
- ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,1]
- ; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[1]
- ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm4, %xmm1
+ ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3],xmm1[0,1]
+ ; AVX1-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+ ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
+ ; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
+ ; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
+ ; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
+ ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
+ ; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
+ ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
+ ; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[1]
+ ; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
@@ -247,34 +245,34 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX1-FAST-LABEL: pair_sum_v8f32_v4f32:
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
- ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
- ; AVX1-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm1
- ; AVX1-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm2
- ; AVX1-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm3
- ; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[1],xmm3[1],zero,zero
- ; AVX1-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm3
+ ; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm8
+ ; AVX1-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
+ ; AVX1-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm0
+ ; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[1],xmm0[1],zero,zero
+ ; AVX1-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX1-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm4
- ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,1]
- ; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
- ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,1],xmm3[1,3]
- ; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[1]
- ; AVX1-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
- ; AVX1-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
- ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm1 = xmm1[1,0]
- ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
+ ; AVX1-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
+ ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,1]
+ ; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]
+ ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[1,3]
+ ; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[1]
+ ; AVX1-FAST-NEXT: vaddps %xmm0, %xmm2, %xmm0
+ ; AVX1-FAST-NEXT: vmovlhps {{.*#+}} xmm1 = xmm8[0],xmm0[0]
+ ; AVX1-FAST-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+ ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
- ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
- ; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
+ ; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+ ; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[2]
; AVX1-FAST-NEXT: retq
;
; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
- ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm8 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vxorps %xmm1, %xmm1, %xmm1
- ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[0,1]
- ; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm0
+ ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3],xmm1[0,1]
+ ; AVX2-SLOW-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+ ; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
@@ -364,17 +362,17 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
- ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
- ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3]
+ ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
+ ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
- ; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2
; SSSE3-SLOW-NEXT: phaddd %xmm4, %xmm5
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm5[2,3,2,3]
- ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,1,0,1]
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,1],xmm1[1,1]
+ ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,1,0,1]
+ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1],xmm1[1,1]
+ ; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2
; SSSE3-SLOW-NEXT: movdqa %xmm2, %xmm1
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[2,0]
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[2,0]
+ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[2,0]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm2
; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6
@@ -388,12 +386,12 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
- ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT: movdqa %xmm5, %xmm1
; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm5
; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm4
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm1
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm4[1,1]
+ ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm3
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[2,0]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[2,0]
@@ -409,20 +407,20 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
- ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
- ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,1]
- ; AVX1-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
- ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm1
- ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,2,1,3]
- ; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm3
+ ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,1]
+ ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+ ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+ ; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
- ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+ ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
+ ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,1,3]
+ ; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
- ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm5[6,7]
- ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,3,1,1]
- ; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[1],zero
+ ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
+ ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,3,1,1]
+ ; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[1],zero
; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
- ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+ ; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
@@ -436,20 +434,20 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX1-FAST: # %bb.0:
; AVX1-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm0, %xmm8
- ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm1
- ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
- ; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm2
- ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm3
+ ; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
+ ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm0
; AVX1-FAST-NEXT: vphaddd %xmm4, %xmm4, %xmm4
; AVX1-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm5
- ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm4[0]
- ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm5[0,0,0,0]
- ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
- ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
- ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6,7]
- ; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[1],zero
- ; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3]
- ; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
+ ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
+ ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,1,3]
+ ; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+ ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm5[0,0,0,0]
+ ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5],xmm3[6,7]
+ ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
+ ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5,6,7]
+ ; AVX1-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm4[1],zero
+ ; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3]
+ ; AVX1-FAST-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm8[0],xmm0[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -462,9 +460,9 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
- ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
- ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,1,1]
- ; AVX2-SLOW-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,1]
+ ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
+ ; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
@@ -1147,13 +1145,13 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
; AVX-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1
; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
- ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
; AVX-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
; AVX-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,1,3]
+ ; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-FAST-NEXT: retq
%5 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %0)