@@ -20,18 +20,9 @@ define <4 x float> @pair_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; SSSE3-SLOW-LABEL: pair_sum_v4f32_v4f32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
- ; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm0[1,3]
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3,1,3]
- ; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
- ; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm2
- ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm1 = xmm2[1,1,3,3]
- ; SSSE3-SLOW-NEXT: addps %xmm2, %xmm1
- ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm3
- ; SSSE3-SLOW-NEXT: movshdup {{.*#+}} xmm2 = xmm3[1,1,3,3]
- ; SSSE3-SLOW-NEXT: addps %xmm3, %xmm2
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+ ; SSSE3-SLOW-NEXT: haddps %xmm2, %xmm3
+ ; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm0
+ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1,3,2]
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: pair_sum_v4f32_v4f32:
@@ -106,13 +97,11 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,1,3]
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
- ; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm2
- ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
- ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm1
- ; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm3
- ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1]
+ ; SSSE3-SLOW-NEXT: phaddd %xmm2, %xmm3
+ ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
+ ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,1,0,1]
+ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1]
; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
; SSSE3-SLOW-NEXT: retq
;
@@ -699,23 +688,20 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm4
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-SLOW-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
- ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
- ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm1
- ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
+ ; SSSE3-SLOW-NEXT: paddd %xmm0, %xmm4
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
- ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm5 = xmm2[0,1,0,1]
; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm5
- ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm5
; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm5
; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
- ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
- ; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm2
- ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,3,3,3]
- ; SSSE3-SLOW-NEXT: paddd %xmm2, %xmm3
- ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm3
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm5[2,3]
- ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,0]
+ ; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
+ ; SSSE3-SLOW-NEXT: paddd %xmm3, %xmm6
+ ; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm6
+ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3]
+ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
+ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
+ ; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
+ ; SSSE3-SLOW-NEXT: paddd %xmm4, %xmm0
; SSSE3-SLOW-NEXT: retq
;
; SSSE3-FAST-LABEL: sequential_sum_v4i32_v4i32:
@@ -724,21 +710,19 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm4
; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSSE3-FAST-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
- ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
- ; SSSE3-FAST-NEXT: paddd %xmm4, %xmm1
- ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm0
+ ; SSSE3-FAST-NEXT: paddd %xmm0, %xmm4
; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1
; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
- ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
- ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4
- ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm4
- ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,2,3]
- ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,3,3,3]
- ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm3
- ; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2
- ; SSSE3-FAST-NEXT: paddd %xmm1, %xmm2
- ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3]
- ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0]
+ ; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
+ ; SSSE3-FAST-NEXT: movdqa %xmm3, %xmm5
+ ; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm5
+ ; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,3,2,3]
+ ; SSSE3-FAST-NEXT: paddd %xmm5, %xmm6
+ ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm1[2,3]
+ ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,1],xmm6[2,0]
+ ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3],xmm2[3,3]
+ ; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3],xmm3[2,0]
+ ; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: sequential_sum_v4i32_v4i32: