[X86] combineConcatVectorOps - add concatenation handling for BITCAST nodes #133913
Conversation
Force-pushed from 9e545ed to 22edba4.
@llvm/pr-subscribers-backend-x86
Author: Simon Pilgrim (RKSimon)
Changes
These nodes are effectively free, so we should only concatenate if the inner nodes will concatenate together. This also exposed a regression in canonicalizeShuffleWithOp that failed to realize it could potentially merge shuffles with a CONCAT_VECTORS node.
Patch is 595.64 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133913.diff
13 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a3c423270f44a..1a771ecd651ea 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41704,6 +41704,7 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG,
getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)) ||
(Op.getOpcode() == Opc && Op->hasOneUse()) ||
(Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
+ (Op.getOpcode() == ISD::CONCAT_VECTORS && Op->hasOneUse()) ||
(FoldShuf && isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
DAG.isSplatValue(Op, /*AllowUndefs*/ false);
};
@@ -58134,6 +58135,40 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
unsigned Opcode = Op0.getOpcode();
switch (Opcode) {
+ case ISD::AssertSext:
+ case ISD::AssertZext: {
+ if (!IsSplat && llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(1) == Op.getOperand(1);
+ }))
+ if (SDValue ConcatSrc =
+ combineConcatVectorOps(DL, VT, Ops, DAG, Subtarget, Depth + 1))
+ return DAG.getNode(Opcode, DL, VT, ConcatSrc, Op0.getOperand(1));
+ break;
+ }
+ case ISD::BITCAST: {
+ // TODO: Support AVX1/AVX2 bitcasts.
+ SmallVector<SDValue, 4> SubOps;
+ for (SDValue SubOp : Ops)
+ SubOps.push_back(peekThroughBitcasts(SubOp.getOperand(0)));
+ EVT InnerVT = SubOps[0].getValueType();
+ unsigned InnerSizeInBits = InnerVT.getScalarSizeInBits();
+ if (!IsSplat && InnerVT.isSimple() && InnerVT.isVector() &&
+ (Subtarget.hasBWI() ||
+ (EltSizeInBits >= 32 && InnerSizeInBits >= 32)) &&
+ ((VT.is256BitVector() && Subtarget.hasVLX()) ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs())) &&
+ llvm::all_of(SubOps, [InnerVT](SDValue Op) {
+ return Op.getValueType() == InnerVT;
+ })) {
+ MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
+ MVT ConcatVT = MVT::getVectorVT(
+ ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
+ if (SDValue ConcatSrc = combineConcatVectorOps(
+ DL, ConcatVT, SubOps, DAG, Subtarget, Depth + 1))
+ return DAG.getBitcast(VT, ConcatSrc);
+ }
+ break;
+ }
case ISD::VECTOR_SHUFFLE: {
// TODO: Generalize NumOps support.
if (!IsSplat && NumOps == 2 &&
diff --git a/llvm/test/CodeGen/X86/shift-i512.ll b/llvm/test/CodeGen/X86/shift-i512.ll
index f7dd1dc0949f5..c7da04171e6a1 100644
--- a/llvm/test/CodeGen/X86/shift-i512.ll
+++ b/llvm/test/CodeGen/X86/shift-i512.ll
@@ -123,20 +123,18 @@ define <8 x i64> @lshr_i512_1(<8 x i64> %a) {
;
; AVX512VBMI-LABEL: lshr_i512_1:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
-; AVX512VBMI-NEXT: vpshldq $63, %xmm5, %xmm3, %xmm3
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512VBMI-NEXT: vpshldq $63, %xmm2, %xmm1, %xmm2
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX512VBMI-NEXT: vpsrlq $1, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512VBMI-NEXT: vpsrlq $1, %xmm2, %xmm2
+; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512VBMI-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
+; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512VBMI-NEXT: vpshldq $63, %zmm0, %zmm2, %zmm0
; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
@@ -238,20 +236,18 @@ define <8 x i64> @ashr_i512_1(<8 x i64> %a) {
;
; AVX512VBMI-LABEL: ashr_i512_1:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm1
-; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm2
-; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm3
+; AVX512VBMI-NEXT: vextracti128 $1, %ymm0, %xmm1
+; AVX512VBMI-NEXT: vextracti32x4 $3, %zmm0, %xmm2
+; AVX512VBMI-NEXT: vextracti32x4 $2, %zmm0, %xmm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
; AVX512VBMI-NEXT: vpshldq $63, %xmm4, %xmm2, %xmm4
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,2,3]
-; AVX512VBMI-NEXT: vpshldq $63, %xmm5, %xmm3, %xmm3
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3
; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
-; AVX512VBMI-NEXT: vpshldq $63, %xmm2, %xmm1, %xmm2
-; AVX512VBMI-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX512VBMI-NEXT: vpsraq $1, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
+; AVX512VBMI-NEXT: vpsraq $1, %xmm2, %xmm2
+; AVX512VBMI-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2
+; AVX512VBMI-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512VBMI-NEXT: vpshufd {{.*#+}} ymm3 = ymm0[2,3,2,3,6,7,6,7]
+; AVX512VBMI-NEXT: vpshldq $63, %ymm3, %ymm1, %ymm1
+; AVX512VBMI-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1
; AVX512VBMI-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[2,3,2,3,6,7,6,7,10,11,10,11,14,15,14,15]
; AVX512VBMI-NEXT: vpshldq $63, %zmm0, %zmm2, %zmm0
; AVX512VBMI-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6]
diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index ec442c185706c..e27a77ed2293d 100644
--- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -297,23 +297,21 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
-; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm0
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
+; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u,33,37,41,45,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,49,53,57,62,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL: # %bb.0:
-; AVX512BWVL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
+; AVX512BWVL-NEXT: vpmovsxbd {{.*#+}} xmm1 = [8,12,0,5]
; AVX512BWVL-NEXT: vextracti64x4 $1, %zmm0, %ymm2
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512BWVL-NEXT: vpermt2d %ymm2, %ymm1, %ymm0
-; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
+; AVX512BWVL-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0
+; AVX512BWVL-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u,33,37,41,45,u,u,u,u,u,u,u,u,u,u,u,u,49,53,57,61,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512BWVL-NEXT: vpermd %zmm0, %zmm1, %zmm0
+; AVX512BWVL-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BWVL-NEXT: vzeroupper
; AVX512BWVL-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll
index 35c707eac83b4..0fa2c858ff000 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll
@@ -1228,13 +1228,14 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94]
-; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512VBMI2-NEXT: vpsrlw %xmm2, %ymm4, %ymm4
-; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX512VBMI2-NEXT: vpsrlw %xmm2, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpermt2b %zmm4, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95]
+; AVX512VBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3
+; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VBMI2-NEXT: vpsrlw %xmm0, %zmm3, %zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
+; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
@@ -1251,16 +1252,29 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %
; AVX512VLBW-NEXT: vpackuswb %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT: retq
;
-; AVX10-LABEL: splatvar_funnnel_v32i8:
-; AVX10: # %bb.0:
-; AVX10-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
-; AVX10-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX10-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
-; AVX10-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
-; AVX10-NEXT: vpsrlw %xmm2, %ymm0, %ymm1
-; AVX10-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
-; AVX10-NEXT: vpermi2b %ymm3, %ymm1, %ymm0
-; AVX10-NEXT: retq
+; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
+; AVX512VLVBMI2: # %bb.0:
+; AVX512VLVBMI2-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
+; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95,0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87]
+; AVX512VLVBMI2-NEXT: vpermi2b %zmm0, %zmm1, %zmm3
+; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm0, %zmm3, %zmm0
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm1 = [32,34,36,38,40,42,44,46,0,2,4,6,8,10,12,14,48,50,52,54,56,58,60,62,16,18,20,22,24,26,28,30]
+; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm1, %zmm0
+; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512VLVBMI2-NEXT: retq
+;
+; AVX10_256-LABEL: splatvar_funnnel_v32i8:
+; AVX10_256: # %bb.0:
+; AVX10_256-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; AVX10_256-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX10_256-NEXT: vpsrlw %xmm2, %ymm3, %ymm3
+; AVX10_256-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23]
+; AVX10_256-NEXT: vpsrlw %xmm2, %ymm0, %ymm1
+; AVX10_256-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
+; AVX10_256-NEXT: vpermi2b %ymm3, %ymm1, %ymm0
+; AVX10_256-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
; XOPAVX1: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
index 5cac190eae690..3d4f283260aa5 100644
--- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
+++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll
@@ -992,25 +992,26 @@ define <32 x i8> @splatvar_funnnel_v32i8(<32 x i8> %x, <32 x i8> %amt) nounwind
;
; AVX512VBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VBMI2: # %bb.0:
-; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78,16,18,20,22,24,26,28,30,80,82,84,86,88,90,92,94]
-; AVX512VBMI2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
+; AVX512VBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm3, %zmm0
; AVX512VBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512VBMI2-NEXT: vpsrlw %xmm1, %ymm3, %ymm3
-; AVX512VBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0
-; AVX512VBMI2-NEXT: vpermt2b %zmm3, %zmm2, %zmm0
+; AVX512VBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
; AVX512VBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT: retq
;
; AVX512VLVBMI2-LABEL: splatvar_funnnel_v32i8:
; AVX512VLVBMI2: # %bb.0:
-; AVX512VLVBMI2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
+; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm2 = [32,34,36,38,40,42,44,46,0,2,4,6,8,10,12,14,48,50,52,54,56,58,60,62,16,18,20,22,24,26,28,30]
+; AVX512VLVBMI2-NEXT: vmovdqa64 {{.*#+}} zmm3 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
+; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm3, %zmm0
; AVX512VLVBMI2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm2, %ymm2
-; AVX512VLVBMI2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
-; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %ymm0, %ymm1
-; AVX512VLVBMI2-NEXT: vmovdqa {{.*#+}} ymm0 = [0,2,4,6,8,10,12,14,32,34,36,38,40,42,44,46,16,18,20,22,24,26,28,30,48,50,52,54,56,58,60,62]
-; AVX512VLVBMI2-NEXT: vpermi2b %ymm2, %ymm1, %ymm0
+; AVX512VLVBMI2-NEXT: vpsrlw %xmm1, %zmm0, %zmm0
+; AVX512VLVBMI2-NEXT: vpermb %zmm0, %zmm2, %zmm0
+; AVX512VLVBMI2-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VLVBMI2-NEXT: retq
;
; XOPAVX1-LABEL: splatvar_funnnel_v32i8:
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
index fc4377a08d560..3e04c2c8120cc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll
@@ -130,13 +130,14 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512BW-FCP-LABEL: store_i16_stride4_vf2:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11]
-; AVX512BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2
-; AVX512BW-FCP-NEXT: vmovdqa %xmm2, (%r8)
+; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm0
+; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm1
+; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,24,0,16,9,25,1,17]
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0
+; AVX512BW-FCP-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
+; AVX512BW-FCP-NEXT: vmovdqa %xmm0, (%r8)
+; AVX512BW-FCP-NEXT: vzeroupper
; AVX512BW-FCP-NEXT: retq
;
; AVX512DQ-BW-LABEL: store_i16_stride4_vf2:
@@ -152,13 +153,14 @@ define void @store_i16_stride4_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512DQ-BW-FCP-LABEL: store_i16_stride4_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
-; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
-; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [0,2,8,10,1,3,9,11]
-; AVX512DQ-BW-FCP-NEXT: vpermi2w %xmm1, %xmm0, %xmm2
-; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm1
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm2 = [8,24,0,16,9,25,1,17]
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%rdi), %ymm0, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm1, %ymm2, %ymm0
+; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, (%r8)
+; AVX512DQ-BW-FCP-NEXT: vzeroupper
; AVX512DQ-BW-FCP-NEXT: retq
%in.vec0 = load <2 x i16>, ptr %in.vecptr0, align 64
%in.vec1 = load <2 x i16>, ptr %in.vecptr1, align 64
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
index 7b619344e83f6..3767f1d68dfb7 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7....
[truncated]
… nodes
These nodes are effectively free. For bitcasts we should only concatenate if the inner nodes will concatenate together. This also exposed a regression in canonicalizeShuffleWithOp that failed to realise it could potentially merge shuffles with a CONCAT_VECTORS node.
Force-pushed from 22edba4 to f184621.
LGTM.
It seems like this is causing a crash when trying to link one of Chromium's testing binaries. Unfortunately I'm not sure how to produce a minimal reproduction at the moment, but I can point to a failing build: https://ci.chromium.org/ui/p/chromium/builders/ci/CFI%20Linux%20ToT/45085/overview
Cheers, that's not much to go on I'm afraid. I can't tell if it's the patch's fault or if it's exposed an existing problem. Any luck with a reduced repro? A full call stack at the assertion might be enough if that's available.
I've uploaded an unreduced repro here: https://crbug.com/409794994#comment4
I'll see if I can turn that into something smaller.
My guess is that getConstVector is seeing an fp type other than the float/double it handles by default - I'll push a quick fix for you to test.
… getNode/FoldConstantArithmetic (#135337)
getConstVector could only handle f32/f64 vector element types from raw APInt bit data - instead of trying to add all supported fp types, just bitcast the integer equivalent and leave it to getNode/FoldConstantArithmetic to perform the constant bitcast conversion.
Tentative fix for a regression reported after #133913.
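For context, that fix takes roughly the following shape (a hedged sketch with an assumed helper name and signature, not the code as committed in #135337): materialise the constant vector in the equivalent integer type directly from the raw APInt bits, then bitcast it and let getNode/FoldConstantArithmetic fold the constant conversion.

// Hedged sketch of the #135337 approach (helper name and signature assumed;
// the in-tree getConstVector differs): build an integer build_vector from the
// raw bits, then bitcast to the requested (possibly fp) vector type.
static SDValue getConstVectorSketch(ArrayRef<APInt> Bits, MVT VT,
                                    SelectionDAG &DAG, const SDLoc &DL) {
  MVT SVT = VT.getScalarType();
  MVT IntSVT = MVT::getIntegerVT(SVT.getSizeInBits());
  MVT IntVT = MVT::getVectorVT(IntSVT, VT.getVectorNumElements());

  SmallVector<SDValue, 16> Ops;
  for (const APInt &Elt : Bits)
    Ops.push_back(DAG.getConstant(Elt, DL, IntSVT));

  SDValue IntVec = DAG.getBuildVector(IntVT, DL, Ops);
  // For integer VTs the bitcast is a no-op; for fp VTs the DAG's constant
  // folding turns it into the fp constant, so no per-type handling is needed.
  return DAG.getBitcast(VT, IntVec);
}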
Uploaded a single-function reproducer at https://crbug.com/409794994#comment5
Everything looks good at that revision. Thanks for the quick fix!
These nodes are effectively free, so we should only concatenate if the inner nodes will concatenate together.
This also exposed a regression in canonicalizeShuffleWithOp that failed to realize it could potentially merge shuffles with a CONCAT_VECTORS node.
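For readers who only skim the truncated diff, here is a minimal sketch of the new ISD::BITCAST case (the helper name is mine and the simplifications are deliberate: the real code also rejects splats, gates on BWI/VLX/AVX512 subtarget features and element sizes, and only folds when a recursive combineConcatVectorOps call on the inner operands succeeds).

// Minimal sketch of the ISD::BITCAST handling (assumed helper name; each Op is
// expected to be a bitcast of a simple inner vector type). Simplified from the
// diff above for illustration only.
static SDValue concatBitcastOpsSketch(const SDLoc &DL, MVT VT,
                                      ArrayRef<SDValue> Ops,
                                      SelectionDAG &DAG) {
  // Peek through the (free) bitcasts to the inner source vectors.
  SmallVector<SDValue, 4> SubOps;
  for (SDValue Op : Ops)
    SubOps.push_back(peekThroughBitcasts(Op.getOperand(0)));

  // All inner operands must share one simple vector type.
  EVT InnerVT = SubOps[0].getValueType();
  if (!InnerVT.isSimple() || !InnerVT.isVector() ||
      !llvm::all_of(SubOps, [InnerVT](SDValue Op) {
        return Op.getValueType() == InnerVT;
      }))
    return SDValue();

  // concat(bitcast(x), bitcast(y)) --> bitcast(concat(x, y))
  MVT ConcatSVT = InnerVT.getScalarType().getSimpleVT();
  MVT ConcatVT = MVT::getVectorVT(
      ConcatSVT, VT.getSizeInBits() / ConcatSVT.getSizeInBits());
  SDValue ConcatSrc = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, SubOps);
  return DAG.getBitcast(VT, ConcatSrc);
}

The key point is that the bitcasts themselves are free, so concatenating through them is only worthwhile when the inner operands would themselves combine into something cheaper than the original pair of subvector operations.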