
Commit 00775f8

[X86] Lower vXi8 multiplies using PMADDUBSW on SSSE3+ targets
Extends #95403 to handle non-constant cases: we can avoid the unpacks/extensions from vXi8 to vXi16 by using PMADDUBSW instead and truncating the vXi16 results back together. Most targets would benefit from performing this for non-constant cases as well; it's just Intel Core/SandyBridge-era CPUs that might experience additional Port0/15 contention.

Fixes #90748
1 parent 29f4a05 commit 00775f8
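As a sanity check of the idea in the commit message, here is a minimal scalar C++ model (not LLVM code, and not part of this commit) of the PMADDUBSW trick: B is split into its even- and odd-indexed bytes with a 0x00FF mask, each PMADDUBSW then sums exactly one u8*i8 product per i16 lane, and the two i16 results are stitched back together byte-wise, mirroring the vpand/vpandn + vpmaddubsw + vpsllw/vpor sequences in the updated tests.

// Minimal scalar sketch of the PMADDUBSW-based vXi8 multiply lowering.
// Assumes standard PMADDUBSW semantics: each i16 lane is the signed-saturated
// sum of two vertical u8 (first operand) * i8 (second operand) products.
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdint>

using V16i8 = std::array<uint8_t, 16>;
using V8i16 = std::array<int16_t, 8>;

// Scalar emulation of PMADDUBSW: a is treated as unsigned, b as signed.
static V8i16 pmaddubsw(const V16i8 &a, const V16i8 &b) {
  V8i16 r{};
  for (int i = 0; i < 8; ++i) {
    int sum = int(a[2 * i]) * int(int8_t(b[2 * i])) +
              int(a[2 * i + 1]) * int(int8_t(b[2 * i + 1]));
    r[i] = int16_t(std::clamp(sum, -32768, 32767)); // signed saturation
  }
  return r;
}

// vXi8 multiply without unpacking to vXi16: mask B into its even and odd
// bytes so each PMADDUBSW pair sums exactly one product, then recombine the
// two i16 results byte-wise (low byte of the even products, odd products
// placed in the high byte of each lane, i.e. the vpsllw $8 + vpor step).
static V16i8 mul_v16i8(const V16i8 &a, const V16i8 &b) {
  V16i8 beven = b, bodd = b;
  for (int i = 0; i < 16; ++i) {
    if (i % 2 == 0)
      bodd[i] = 0;  // BOdd keeps only the odd-indexed bytes of B
    else
      beven[i] = 0; // BEven keeps only the even-indexed bytes of B
  }
  V8i16 lo = pmaddubsw(a, beven); // products of even-indexed bytes
  V8i16 hi = pmaddubsw(a, bodd);  // products of odd-indexed bytes
  V16i8 r{};
  for (int i = 0; i < 8; ++i) {
    r[2 * i] = uint8_t(lo[i]);     // keep the low byte of the even product
    r[2 * i + 1] = uint8_t(hi[i]); // low byte of the odd product, shifted into place
  }
  return r;
}

int main() {
  V16i8 a, b;
  for (int i = 0; i < 16; ++i) {
    a[i] = uint8_t(17 * i + 3);
    b[i] = uint8_t(201 * i + 5);
  }
  V16i8 r = mul_v16i8(a, b);
  for (int i = 0; i < 16; ++i)
    assert(r[i] == uint8_t(a[i] * b[i])); // matches plain wraparound i8 multiply
  return 0;
}

Because one byte of each pair is zeroed, the saturating add can never saturate, and the low 8 bits of each u8*i8 product equal the low 8 bits of the plain u8*u8 product, so the truncated result matches an ordinary vXi8 multiply.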

8 files changed: +699 -814 lines changed

8 files changed

+699
-814
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 13 additions & 10 deletions
@@ -28503,17 +28503,19 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
 
     MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
 
-    // For vXi8 mul-by-constant, try PMADDUBSW to avoid the need for extension.
+    // For vXi8 mul, try PMADDUBSW to avoid the need for extension.
     // Don't do this if we only need to unpack one half.
-    if (Subtarget.hasSSSE3() &&
-        ISD::isBuildVectorOfConstantSDNodes(B.getNode())) {
-      bool IsLoLaneAllZeroOrUndef = true;
-      bool IsHiLaneAllZeroOrUndef = true;
-      for (auto [Idx, Val] : enumerate(B->ops())) {
-        if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
-          IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
-        else
-          IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+    if (Subtarget.hasSSSE3()) {
+      bool BIsBuildVector = isa<BuildVectorSDNode>(B);
+      bool IsLoLaneAllZeroOrUndef = BIsBuildVector;
+      bool IsHiLaneAllZeroOrUndef = BIsBuildVector;
+      if (BIsBuildVector) {
+        for (auto [Idx, Val] : enumerate(B->ops())) {
+          if ((Idx % NumEltsPerLane) >= (NumEltsPerLane / 2))
+            IsHiLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+          else
+            IsLoLaneAllZeroOrUndef &= isNullConstantOrUndef(Val);
+        }
       }
       if (!(IsLoLaneAllZeroOrUndef || IsHiLaneAllZeroOrUndef)) {
         SDValue Mask = DAG.getBitcast(VT, DAG.getConstant(0x00FF, dl, ExVT));
@@ -28528,6 +28530,7 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
       }
     }
 
+
     // Extract the lo/hi parts to any extend to i16.
     // We're going to mask off the low byte of each result element of the
     // pmullw, so it doesn't matter what's in the high byte of each 16-bit

llvm/test/CodeGen/X86/avx2-arith.ll

Lines changed: 8 additions & 10 deletions
@@ -121,16 +121,14 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind readnone {
 define <32 x i8> @mul_v32i8(<32 x i8> %i, <32 x i8> %j) nounwind readnone {
 ; CHECK-LABEL: mul_v32i8:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-NEXT: vpmullw %ymm2, %ymm3, %ymm2
-; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-NEXT: vpand %ymm3, %ymm2, %ymm2
-; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-NEXT: vpmullw %ymm1, %ymm0, %ymm0
-; CHECK-NEXT: vpand %ymm3, %ymm0, %ymm0
-; CHECK-NEXT: vpackuswb %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vpbroadcastw {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-NEXT: vpand %ymm1, %ymm2, %ymm3
+; CHECK-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
+; CHECK-NEXT: vpand %ymm2, %ymm3, %ymm3
+; CHECK-NEXT: vpandn %ymm1, %ymm2, %ymm1
+; CHECK-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT: vpsllw $8, %ymm0, %ymm0
+; CHECK-NEXT: vpor %ymm0, %ymm3, %ymm0
 ; CHECK-NEXT: ret{{[l|q]}}
   %x = mul <32 x i8> %i, %j
   ret <32 x i8> %x

llvm/test/CodeGen/X86/midpoint-int-vec-128.ll

Lines changed: 185 additions & 206 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/midpoint-int-vec-256.ll

Lines changed: 270 additions & 325 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/X86/min-legal-vector-width.ll

Lines changed: 58 additions & 75 deletions
@@ -889,21 +889,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
 ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
 ; CHECK-SKX-VBMI-NEXT: vmovdqa (%rsi), %ymm2
 ; CHECK-SKX-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-SKX-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
-; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1
-; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-SKX-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-SKX-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0
+; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
+; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
+; CHECK-SKX-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3
+; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
+; CHECK-SKX-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
+; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
+; CHECK-SKX-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
+; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
+; CHECK-SKX-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2
+; CHECK-SKX-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
+; CHECK-SKX-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
 ; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
 ; CHECK-SKX-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
 ; CHECK-SKX-VBMI-NEXT: vzeroupper
@@ -915,25 +912,19 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
 ; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1
 ; CHECK-AVX512-NEXT: vmovdqa (%rsi), %ymm2
 ; CHECK-AVX512-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-AVX512-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-AVX512-NEXT: vpand %ymm5, %ymm4, %ymm4
-; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; CHECK-AVX512-NEXT: vpand %ymm5, %ymm1, %ymm1
-; CHECK-AVX512-NEXT: vpackuswb %ymm4, %ymm1, %ymm1
-; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-AVX512-NEXT: vpmullw %ymm3, %ymm4, %ymm3
-; CHECK-AVX512-NEXT: vpand %ymm5, %ymm3, %ymm3
-; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-AVX512-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; CHECK-AVX512-NEXT: vpand %ymm5, %ymm0, %ymm0
-; CHECK-AVX512-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
+; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpand %ymm3, %ymm4, %ymm5
+; CHECK-AVX512-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
+; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3
+; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
+; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1
+; CHECK-AVX512-NEXT: vpternlogq $248, %ymm4, %ymm5, %ymm1
+; CHECK-AVX512-NEXT: vpand %ymm2, %ymm4, %ymm3
+; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3
+; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2
+; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
+; CHECK-AVX512-NEXT: vpsllw $8, %ymm0, %ymm0
+; CHECK-AVX512-NEXT: vpternlogq $248, %ymm4, %ymm3, %ymm0
 ; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx)
 ; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx)
 ; CHECK-AVX512-NEXT: vzeroupper
@@ -945,21 +936,18 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
 ; CHECK-VBMI-NEXT: vmovdqa 32(%rdi), %ymm1
 ; CHECK-VBMI-NEXT: vmovdqa (%rsi), %ymm2
 ; CHECK-VBMI-NEXT: vmovdqa 32(%rsi), %ymm3
-; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm3[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-VBMI-NEXT: vpmullw %ymm3, %ymm1, %ymm1
-; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
-; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm1
-; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} ymm5 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
-; CHECK-VBMI-NEXT: vpmullw %ymm4, %ymm5, %ymm4
-; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; CHECK-VBMI-NEXT: vpmullw %ymm2, %ymm0, %ymm0
-; CHECK-VBMI-NEXT: vpermt2b %ymm4, %ymm3, %ymm0
+; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-VBMI-NEXT: vpandn %ymm3, %ymm4, %ymm5
+; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm1, %ymm5
+; CHECK-VBMI-NEXT: vpand %ymm3, %ymm4, %ymm3
+; CHECK-VBMI-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1
+; CHECK-VBMI-NEXT: vmovdqa {{.*#+}} ymm3 = [0,32,2,34,4,36,6,38,8,40,10,42,12,44,14,46,16,48,18,50,20,52,22,54,24,56,26,58,28,60,30,62]
+; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm1
+; CHECK-VBMI-NEXT: vpandn %ymm2, %ymm4, %ymm5
+; CHECK-VBMI-NEXT: vpmaddubsw %ymm5, %ymm0, %ymm5
+; CHECK-VBMI-NEXT: vpand %ymm2, %ymm4, %ymm2
+; CHECK-VBMI-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0
+; CHECK-VBMI-NEXT: vpermt2b %ymm5, %ymm3, %ymm0
 ; CHECK-VBMI-NEXT: vmovdqa %ymm0, (%rdx)
 ; CHECK-VBMI-NEXT: vmovdqa %ymm1, 32(%rdx)
 ; CHECK-VBMI-NEXT: vzeroupper
@@ -976,14 +964,13 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
 ; CHECK-SKX-VBMI: # %bb.0:
 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; CHECK-SKX-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; CHECK-SKX-VBMI-NEXT: vpmullw %zmm2, %zmm3, %zmm2
-; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; CHECK-SKX-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; CHECK-SKX-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
-; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
+; CHECK-SKX-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-SKX-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
+; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
+; CHECK-SKX-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1
+; CHECK-SKX-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
+; CHECK-SKX-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
+; CHECK-SKX-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
 ; CHECK-SKX-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
 ; CHECK-SKX-VBMI-NEXT: vzeroupper
 ; CHECK-SKX-VBMI-NEXT: retq
@@ -992,16 +979,13 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
 ; CHECK-AVX512: # %bb.0:
 ; CHECK-AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
 ; CHECK-AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; CHECK-AVX512-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; CHECK-AVX512-NEXT: vpmullw %zmm2, %zmm3, %zmm2
-; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
-; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm2, %zmm2
-; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; CHECK-AVX512-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; CHECK-AVX512-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; CHECK-AVX512-NEXT: vpandq %zmm3, %zmm0, %zmm0
-; CHECK-AVX512-NEXT: vpackuswb %zmm2, %zmm0, %zmm0
+; CHECK-AVX512-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-AVX512-NEXT: vpandq %zmm1, %zmm2, %zmm3
+; CHECK-AVX512-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
+; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1
+; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
+; CHECK-AVX512-NEXT: vpsllw $8, %zmm0, %zmm0
+; CHECK-AVX512-NEXT: vpternlogq $248, %zmm2, %zmm3, %zmm0
 ; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx)
 ; CHECK-AVX512-NEXT: vzeroupper
 ; CHECK-AVX512-NEXT: retq
@@ -1010,14 +994,13 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"="
 ; CHECK-VBMI: # %bb.0:
 ; CHECK-VBMI-NEXT: vmovdqa64 (%rdi), %zmm0
 ; CHECK-VBMI-NEXT: vmovdqa64 (%rsi), %zmm1
-; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm2 = zmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; CHECK-VBMI-NEXT: vpunpckhbw {{.*#+}} zmm3 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
-; CHECK-VBMI-NEXT: vpmullw %zmm2, %zmm3, %zmm2
-; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; CHECK-VBMI-NEXT: vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
-; CHECK-VBMI-NEXT: vpmullw %zmm1, %zmm0, %zmm0
-; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94,32,34,36,38,40,42,44,46,96,98,100,102,104,106,108,110,48,50,52,54,56,58,60,62,112,114,116,118,120,122,124,126]
-; CHECK-VBMI-NEXT: vpermi2b %zmm2, %zmm0, %zmm1
+; CHECK-VBMI-NEXT: vpbroadcastw {{.*#+}} zmm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; CHECK-VBMI-NEXT: vpandnq %zmm1, %zmm2, %zmm3
+; CHECK-VBMI-NEXT: vpmaddubsw %zmm3, %zmm0, %zmm3
+; CHECK-VBMI-NEXT: vpandq %zmm1, %zmm2, %zmm1
+; CHECK-VBMI-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0
+; CHECK-VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,64,2,66,4,68,6,70,8,72,10,74,12,76,14,78,16,80,18,82,20,84,22,86,24,88,26,90,28,92,30,94,32,96,34,98,36,100,38,102,40,104,42,106,44,108,46,110,48,112,50,114,52,116,54,118,56,120,58,122,60,124,62,126]
+; CHECK-VBMI-NEXT: vpermi2b %zmm3, %zmm0, %zmm1
 ; CHECK-VBMI-NEXT: vmovdqa64 %zmm1, (%rdx)
 ; CHECK-VBMI-NEXT: vzeroupper
 ; CHECK-VBMI-NEXT: retq
