@@ -761,17 +761,12 @@ define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT:    pmuludq %xmm1, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; SSE2-NEXT:    pmuludq %xmm1, %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT:    pmuludq %xmm1, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    psubd %xmm2, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    psrld $2, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: vector_div_leading_zeros:
@@ -780,13 +775,9 @@ define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
 ; SSE41-NEXT:    pmuludq %xmm2, %xmm1
-; SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE41-NEXT:    psubd %xmm2, %xmm0
-; SSE41-NEXT:    psrld $1, %xmm0
-; SSE41-NEXT:    paddd %xmm2, %xmm0
-; SSE41-NEXT:    psrld $2, %xmm0
+; SSE41-NEXT:    pmuludq %xmm2, %xmm0
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: vector_div_leading_zeros:
@@ -795,13 +786,9 @@ define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
 ; AVX1-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; AVX1-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $2, %xmm0, %xmm0
+; AVX1-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: vector_div_leading_zeros:
@@ -810,13 +797,9 @@ define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
 ; AVX2-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX2-NEXT:    vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $1, %xmm0, %xmm0
-; AVX2-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $2, %xmm0, %xmm0
+; AVX2-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
 ; AVX2-NEXT:    retq
 ;
 ; XOP-LABEL: vector_div_leading_zeros:
@@ -825,13 +808,9 @@ define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
 ; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; XOP-NEXT:    vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
 ; XOP-NEXT:    vpmuludq %xmm2, %xmm1, %xmm1
-; XOP-NEXT:    vpmuludq %xmm2, %xmm0, %xmm2
-; XOP-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; XOP-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; XOP-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    vpsrld $1, %xmm0, %xmm0
-; XOP-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
-; XOP-NEXT:    vpsrld $2, %xmm0, %xmm0
+; XOP-NEXT:    vpmuludq %xmm2, %xmm0, %xmm0
+; XOP-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; XOP-NEXT:    retq
   %a = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
   %b = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
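
Note (not part of the patch itself): these checks cover `udiv` by 7 after an `and` with 255. The mask guarantees 24 leading zero bits, which lets the backend keep only the high half of the 32x32 `pmuludq` and drop the generic fixup tail (`psubd`, `psrld $1`, `paddd`, `psrld $2`). A minimal standalone C sketch of why that is sound, built only from what the test shows (the multiplier 613566757 and the mask 255):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* 613566757 = 0x24924925 = ceil(2^32 / 7), so 7 * 613566757 = 2^32 + 3.
 * Therefore (x * 613566757) >> 32 = floor(x/7 + 3*x / (7 * 2^32)); for
 * x <= 255 the error term is below 2^-24 while the fractional part of
 * x/7 is at most 6/7, so the high half of the multiply is already the
 * exact quotient and no subtract/shift/add fixup is needed. */
int main(void) {
    const uint64_t magic = 613566757; /* ceil(2^32 / 7) */
    for (uint32_t x = 0; x <= 255; ++x) {
        /* high half of the 32x32 multiply, as pmuludq + pshufd compute it */
        uint32_t q = (uint32_t)((x * magic) >> 32);
        assert(q == x / 7);
    }
    puts("high-half multiply by 613566757 equals x/7 for all x in [0,255]");
    return 0;
}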