[X86] Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) on vXi8 vectors #143359
Conversation
…ctors Undo the vectorcombine canonicalisation as SSE has awful vXi8 shift support, but can easily splat the MSB using the PCMPGTB(0,x) trick. Alternative to llvm#143106, which could cause infinite loops between srl/sra conversions. Fixes llvm#130549
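The identity behind the fold is easy to check at the scalar level: for an i8 value Y, a logical shift right by 7 extracts the sign bit (0 or 1), while the signed compare 0 > Y produces an all-ones mask (-1) exactly when that bit is set, so adding the shift result is the same as subtracting the mask. A minimal exhaustive check in C++ (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (int xi = 0; xi < 256; ++xi) {
    for (int yi = 0; yi < 256; ++yi) {
      uint8_t x = uint8_t(xi), y = uint8_t(yi);
      uint8_t lhs = uint8_t(x + (y >> 7));            // (add X, (srl Y, 7))
      int8_t mask = (int8_t(0) > int8_t(y)) ? -1 : 0; // (icmp_sgt 0, Y), sign-extended
      uint8_t rhs = uint8_t(x - uint8_t(mask));       // (sub X, sext(mask))
      assert(lhs == rhs);
    }
  }
  return 0;
}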
@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

Undo the vectorcombine canonicalisation as SSE has awful vXi8 shift support, but can easily splat the MSB using the PCMPGTB(0,x) trick. Alternative to #143106, which could cause infinite loops between srl/sra conversions. Fixes #130549

Patch is 109.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/143359.diff

4 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index b59d4a8618220..49a06d31802d0 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -58117,21 +58117,31 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
}
}
- // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
- // (sub Y, (sext (vXi1 X))).
- // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in
- // generic DAG combine without a legal type check, but adding this there
- // caused regressions.
if (VT.isVector()) {
SDValue X, Y;
EVT BoolVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
VT.getVectorElementCount());
+
+ // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into
+ // (sub Y, (sext (vXi1 X))).
+ // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y)
+ // in generic DAG combine without a legal type check, but adding this there
+ // caused regressions.
if (DAG.getTargetLoweringInfo().isTypeLegal(BoolVT) &&
sd_match(N, m_Add(m_ZExt(m_AllOf(m_SpecificVT(BoolVT), m_Value(X))),
m_Value(Y)))) {
SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, X);
return DAG.getNode(ISD::SUB, DL, VT, Y, SExt);
}
+
+ // Fold (add X, (srl Y, 7)) -> (sub X, (icmp_sgt 0, Y)) to undo instcombine
+ // canonicalisation as we don't have good vXi8 shifts.
+ if (VT.getScalarType() == MVT::i8 &&
+ sd_match(N, m_Add(m_Value(X), m_Srl(m_Value(Y), m_SpecificInt(7))))) {
+ SDValue Cmp =
+ DAG.getSetCC(DL, BoolVT, DAG.getConstant(0, DL, VT), Y, ISD::SETGT);
+ return DAG.getNode(ISD::SUB, DL, VT, X, DAG.getSExtOrTrunc(Cmp, DL, VT));
+ }
}
// Peephole for 512-bit VPDPBSSD on non-VLX targets.
diff --git a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index cfa71ff00aa67..816d5cace033a 100644
--- a/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/llvm/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -174,20 +174,19 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT: pmulhw %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: pmulhw %xmm3, %xmm1
-; SSE-NEXT: psrlw $8, %xmm1
-; SSE-NEXT: packuswb %xmm2, %xmm1
-; SSE-NEXT: paddb %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: psrlw $2, %xmm1
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; SSE-NEXT: pxor %xmm2, %xmm1
-; SSE-NEXT: psrlw $7, %xmm0
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE-NEXT: pmulhw %xmm3, %xmm4
+; SSE-NEXT: psrlw $8, %xmm4
+; SSE-NEXT: packuswb %xmm2, %xmm4
+; SSE-NEXT: paddb %xmm4, %xmm0
+; SSE-NEXT: psrlw $2, %xmm0
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE-NEXT: pxor %xmm2, %xmm0
; SSE-NEXT: psubb %xmm2, %xmm0
+; SSE-NEXT: pcmpgtb %xmm0, %xmm1
+; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_div7_16i8:
@@ -197,19 +196,18 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpsubb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_div7_16i8:
@@ -220,14 +218,14 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX2NOBW-NEXT: vpsubb %xmm2, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
@@ -238,14 +236,14 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm1
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX512BW-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsrlw $2, %xmm0, %xmm0
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX512BW-NEXT: vpsubb %xmm2, %xmm0, %xmm0
+; AVX512BW-NEXT: vpbroadcastb {{.*#+}} xmm1 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX512BW-NEXT: vpxor %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%res = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
@@ -264,26 +262,25 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
-; SSE-NEXT: psrlw $8, %xmm1
-; SSE-NEXT: packuswb %xmm2, %xmm1
+; SSE-NEXT: pxor %xmm3, %xmm3
+; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; SSE-NEXT: psrlw $8, %xmm3
+; SSE-NEXT: packuswb %xmm2, %xmm3
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: paddb %xmm1, %xmm0
-; SSE-NEXT: movdqa %xmm0, %xmm1
-; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
-; SSE-NEXT: psraw $8, %xmm1
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [32,64,128,32,64,128,128,64]
-; SSE-NEXT: psrlw $8, %xmm1
+; SSE-NEXT: paddb %xmm3, %xmm0
; SSE-NEXT: movdqa %xmm0, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE-NEXT: psraw $8, %xmm2
-; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [64,64,128,64,32,128,64,32]
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [32,64,128,32,64,128,128,64]
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: packuswb %xmm1, %xmm2
-; SSE-NEXT: psrlw $7, %xmm0
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: paddb %xmm2, %xmm0
+; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; SSE-NEXT: psraw $8, %xmm0
+; SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,64,128,64,32,128,64,32]
+; SSE-NEXT: psrlw $8, %xmm0
+; SSE-NEXT: packuswb %xmm2, %xmm0
+; SSE-NEXT: pcmpgtb %xmm0, %xmm1
+; SSE-NEXT: psubb %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_divconstant_16i8:
@@ -292,24 +289,23 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [37632,20224,11008,47872,26368,14592,14592,37632]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [37632,33024,14592,26368,47872,11008,20224,37632]
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 # [37632,33024,14592,26368,47872,11008,20224,37632]
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [32,64,128,32,64,128,128,64]
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [64,64,128,64,32,128,64,32]
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [32,64,128,32,64,128,128,64]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpsraw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,64,128,64,32,128,64,32]
+; AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX2NOBW-LABEL: test_divconstant_16i8:
@@ -321,14 +317,14 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
-; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm1
-; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [64,64,128,64,32,128,64,32,32,64,128,32,64,128,128,64]
-; AVX2NOBW-NEXT: vpsrlw $8, %ymm1, %ymm1
-; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX2NOBW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX2NOBW-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,64,128,64,32,128,64,32,32,64,128,32,64,128,128,64]
+; AVX2NOBW-NEXT: vpsrlw $8, %ymm0, %ymm0
+; AVX2NOBW-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX2NOBW-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX2NOBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX2NOBW-NEXT: vzeroupper
; AVX2NOBW-NEXT: retq
;
@@ -341,12 +337,12 @@ define <16 x i8> @test_divconstant_16i8(<16 x i8> %a) nounwind {
; AVX512BW-NEXT: vpmovwb %zmm2, %ymm2
; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vpaddb %xmm0, %xmm2, %xmm0
-; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm2
-; AVX512BW-NEXT: vpsravw %zmm1, %zmm2, %zmm1
-; AVX512BW-NEXT: vpmovwb %zmm1, %ymm1
-; AVX512BW-NEXT: vpsrlw $7, %xmm0, %xmm0
-; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX512BW-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT: vpmovsxbw %xmm0, %ymm0
+; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT: vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512BW-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm1
+; AVX512BW-NEXT: vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
%res = sdiv <16 x i8> %a, <i8 7, i8 8, i8 9, i8 10,i8 11, i8 12, i8 13, i8 14, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9,i8 9, i8 7>
@@ -568,25 +564,24 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; SSE-NEXT: movdqa {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; SSE-NEXT: pmulhw %xmm3, %xmm2
; SSE-NEXT: psrlw $8, %xmm2
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT: pmulhw %xmm3, %xmm1
-; SSE-NEXT: psrlw $8, %xmm1
-; SSE-NEXT: packuswb %xmm2, %xmm1
-; SSE-NEXT: paddb %xmm0, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psrlw $2, %xmm2
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: movdqa {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; SSE-NEXT: pxor %xmm3, %xmm2
-; SSE-NEXT: psrlw $7, %xmm1
+; SSE-NEXT: pxor %xmm4, %xmm4
+; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE-NEXT: pmulhw %xmm3, %xmm4
+; SSE-NEXT: psrlw $8, %xmm4
+; SSE-NEXT: packuswb %xmm2, %xmm4
+; SSE-NEXT: paddb %xmm0, %xmm4
+; SSE-NEXT: psrlw $2, %xmm4
+; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; SSE-NEXT: pxor %xmm2, %xmm4
+; SSE-NEXT: psubb %xmm2, %xmm4
+; SSE-NEXT: pcmpgtb %xmm4, %xmm1
+; SSE-NEXT: psubb %xmm1, %xmm4
+; SSE-NEXT: movdqa %xmm4, %xmm1
+; SSE-NEXT: psllw $3, %xmm1
; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE-NEXT: paddb %xmm2, %xmm1
-; SSE-NEXT: psubb %xmm3, %xmm1
-; SSE-NEXT: movdqa %xmm1, %xmm2
-; SSE-NEXT: psllw $3, %xmm2
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE-NEXT: psubb %xmm2, %xmm1
-; SSE-NEXT: paddb %xmm1, %xmm0
+; SSE-NEXT: psubb %xmm1, %xmm4
+; SSE-NEXT: paddb %xmm4, %xmm0
; SSE-NEXT: retq
;
; AVX1-LABEL: test_rem7_16i8:
@@ -596,19 +591,18 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [37632,37632,37632,37632,37632,37632,37632,37632]
; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT: vpmulhw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1
-; AVX1-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm0, %xmm1, %xmm1
-; AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2
+; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; AVX1-NEXT: vpmulhw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
+; AVX1-NEXT: vpackuswb %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpaddb %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vpsrlw $2, %xmm2, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrlw $7, %xmm1, %xmm1
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtb %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT: vpsubb %xmm2, %xmm1, %xmm1
@@ -623,14 +617,14 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
; AVX2NOBW-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2NOBW-NEXT: vpackuswb %xmm2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpaddb %xmm0, %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm2
-; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm3 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
-; AVX2NOBW-NEXT: vpxor %xmm3, %xmm2, %xmm2
-; AVX2NOBW-NEXT: vpsrlw $7, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsrlw $2, %xmm1, %xmm1
; AVX2NOBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2NOBW-NEXT: vpaddb %xmm1, %xmm2, %xmm1
-; AVX2NOBW-NEXT: vpsubb %xmm3, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpbroadcastb {{.*#+}} xmm2 = [32,32,32,32,32,32,32,32,32,32,32,32,32,32,32,32]
+; AVX2NOBW-NEXT: vpxor %xmm2, %xmm1, %xmm1
+; AVX2NOBW-NEXT: vpsubb %xmm2,...
[truncated]
Undo the vectorcombine canonicalisation as SSE has awful vXi8 shift support, but can easily splat the MSB using the PCMPGTB(0,x) trick.
Alternative to #143106, which could cause infinite loops between srl/sra conversions.
Fixes #130549
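For illustration, the same rewrite sketched with SSE2 intrinsics (function names are invented for the example; this is not code from the patch): SSE2 has no per-byte shift, so a byte-wise srl-by-7 has to be emulated with a 16-bit shift plus a mask, whereas PCMPGTB against zero splats the sign bit in a single instruction.

#include <emmintrin.h> // SSE2

// (add X, (srl Y, 7)): emulate the missing vXi8 logical shift.
__m128i add_msb_via_shift(__m128i x, __m128i y) {
  __m128i bit = _mm_and_si128(_mm_srli_epi16(y, 7), _mm_set1_epi8(1));
  return _mm_add_epi8(x, bit);
}

// (sub X, (icmp_sgt 0, Y)): pcmpgtb(0, Y) is 0xFF where Y is negative,
// and subtracting -1 adds the sign bit back.
__m128i add_msb_via_pcmpgt(__m128i x, __m128i y) {
  __m128i mask = _mm_cmpgt_epi8(_mm_setzero_si128(), y);
  return _mm_sub_epi8(x, mask);
}

Both functions return the same vector for all inputs; the second form matches the pcmpgtb/psubb sequences visible in the updated test checks above.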