@@ -889,21 +889,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
889
889
; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
890
890
; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2
891
891
; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
892
- ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
893
- ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
894
- ; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
895
- ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
896
- ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
897
- ; CHECK-SKX-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1
898
- ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
899
- ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1
900
- ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
901
- ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
902
- ; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
903
- ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
904
- ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
905
- ; CHECK-SKX-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0
906
- ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0
892
+ ; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
893
+ ; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
894
+ ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
895
+ ; CHECK-SKX-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3
896
+ ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
897
+ ; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
898
+ ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
899
+ ; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
900
+ ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
901
+ ; CHECK-SKX-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2
902
+ ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
903
+ ; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
907
904
; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
908
905
; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
909
906
; CHECK-SKX-VBMI-NEXT: vzeroupper
@@ -915,25 +912,19 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
915
912
; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
916
913
; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2
917
914
; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3
918
- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
919
- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
920
- ; CHECK-AVX512-NEXT: vpmullw %ymm4, %ymm5, %ymm4
921
- ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
922
- ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm4, %ymm4
923
- ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
924
- ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
925
- ; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm1
926
- ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm1, %ymm1
927
- ; CHECK-AVX512-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
928
- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
929
- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
930
- ; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm4, %ymm3
931
- ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm3, %ymm3
932
- ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
933
- ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
934
- ; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
935
- ; CHECK-AVX512-NEXT: vpand %ymm5, %ymm0, %ymm0
936
- ; CHECK-AVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
915
+ ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
916
+ ; CHECK-AVX512-NEXT: vpand %ymm3, %ymm4, %ymm5
917
+ ; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
918
+ ; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3
919
+ ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
920
+ ; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1
921
+ ; CHECK-AVX512-NEXT: vpternlogq $248, %ymm4, %ymm5, %ymm1
922
+ ; CHECK-AVX512-NEXT: vpand %ymm2, %ymm4, %ymm3
923
+ ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
924
+ ; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2
925
+ ; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
926
+ ; CHECK-AVX512-NEXT: vpsllw $8, %ymm0, %ymm0
927
+ ; CHECK-AVX512-NEXT: vpternlogq $248, %ymm4, %ymm3, %ymm0
937
928
; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx)
938
929
; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx)
939
930
; CHECK-AVX512-NEXT: vzeroupper
@@ -945,21 +936,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
945
936
; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
946
937
; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2
947
938
; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
948
- ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
949
- ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
950
- ; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
951
- ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
952
- ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
953
- ; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1
954
- ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
955
- ; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1
956
- ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
957
- ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
958
- ; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
959
- ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
960
- ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
961
- ; CHECK-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0
962
- ; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0
939
+ ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
940
+ ; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
941
+ ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
942
+ ; CHECK-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3
943
+ ; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
944
+ ; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
945
+ ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
946
+ ; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
947
+ ; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
948
+ ; CHECK-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2
949
+ ; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
950
+ ; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
963
951
; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
964
952
; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
965
953
; CHECK-VBMI-NEXT: vzeroupper
@@ -976,14 +964,13 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
976
964
; CHECK-SKX-VBMI: # %bb.0:
977
965
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
978
966
; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
979
- ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
980
- ; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
981
- ; CHECK-SKX-VBMI-NEXT: vpmullw %zmm2, %zmm3, %zmm2
982
- ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
983
- ; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
984
- ; CHECK-SKX-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm0
985
- ; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
986
- ; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
967
+ ; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
968
+ ; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
969
+ ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
970
+ ; CHECK-SKX-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1
971
+ ; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
972
+ ; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
973
+ ; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
987
974
; CHECK-SKX-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
988
975
; CHECK-SKX-VBMI-NEXT: vzeroupper
989
976
; CHECK-SKX-VBMI-NEXT: retq
@@ -992,16 +979,13 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
992
979
; CHECK-AVX512: # %bb.0:
993
980
; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
994
981
; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
995
- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
996
- ; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
997
- ; CHECK-AVX512-NEXT: vpmullw %zmm2, %zmm3, %zmm2
998
- ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
999
- ; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2
1000
- ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1001
- ; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1002
- ; CHECK-AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1003
- ; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm0, %zmm0
1004
- ; CHECK-AVX512-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
982
+ ; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
983
+ ; CHECK-AVX512-NEXT: vpandq %zmm1, %zmm2, %zmm3
984
+ ; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
985
+ ; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1
986
+ ; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
987
+ ; CHECK-AVX512-NEXT: vpsllw $8, %zmm0, %zmm0
988
+ ; CHECK-AVX512-NEXT: vpternlogq $248, %zmm2, %zmm3, %zmm0
1005
989
; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
1006
990
; CHECK-AVX512-NEXT: vzeroupper
1007
991
; CHECK-AVX512-NEXT: retq
@@ -1010,14 +994,13 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
1010
994
; CHECK-VBMI: # %bb.0:
1011
995
; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
1012
996
; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
1013
- ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1014
- ; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
1015
- ; CHECK-VBMI-NEXT: vpmullw %zmm2, %zmm3, %zmm2
1016
- ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1017
- ; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
1018
- ; CHECK-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm0
1019
- ; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
1020
- ; CHECK-VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
997
+ ; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
998
+ ; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
999
+ ; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
1000
+ ; CHECK-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1
1001
+ ; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
1002
+ ; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
1003
+ ; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
1021
1004
; CHECK-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
1022
1005
; CHECK-VBMI-NEXT: vzeroupper
1023
1006
; CHECK-VBMI-NEXT: retq
0 commit comments