Commit 1070f6b

[X86] SimplifyDemandedVectorEltsForTargetNode - reduce the size of VPERMV/VPERMV3 nodes if the upper elements are not demanded
With AVX512VL targets, use 128/256-bit VPERMV/VPERMV3 nodes when only the lower elements are demanded. This exposed an issue with the VPERMV3(X,M,Y) -> VPERMV(M,CONCAT(X,Y)) fold when X == Y, so I had to move that fold after the other VPERMV3 folds/canonicalizations. I also took the opportunity to support the commuted VPERMV(M,CONCAT(Y,X)) case as well; we can revert that part if we'd prefer to avoid the extra VSHUFF64X2 node that arises for non-constant shuffle masks (with separate loads).
Parent: 2426ac6

10 files changed, 256 insertions(+), 182 deletions(-)
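
For intuition, the fold being reordered here can be modeled at the element level: VPERMV3(X, M, Y) selects from the 2*N-element pool formed by its two sources, so whenever X and Y can be freely concatenated it is equivalent to a double-width VPERMV of CONCAT(X, Y). Below is a minimal standalone scalar model of that equivalence (illustrative only, not the DAG code; the helper names are made up for the sketch):

#include <cassert>
#include <cstddef>
#include <vector>

// Scalar model of VPERMV: Result[i] = Src[Mask[i]].
static std::vector<int> vpermv(const std::vector<int> &Mask,
                               const std::vector<int> &Src) {
  std::vector<int> R;
  for (int M : Mask)
    R.push_back(Src[(size_t)M]);
  return R;
}

// Scalar model of VPERMV3: Result[i] = Mask[i] < N ? X[Mask[i]] : Y[Mask[i] - N].
static std::vector<int> vpermv3(const std::vector<int> &X,
                                const std::vector<int> &Mask,
                                const std::vector<int> &Y) {
  int N = (int)X.size();
  std::vector<int> R;
  for (int M : Mask)
    R.push_back(M < N ? X[(size_t)M] : Y[(size_t)(M - N)]);
  return R;
}

int main() {
  std::vector<int> X = {10, 11, 12, 13}, Y = {20, 21, 22, 23};
  std::vector<int> M = {7, 0, 5, 2};
  std::vector<int> XY = {10, 11, 12, 13, 20, 21, 22, 23}; // CONCAT(X, Y)
  // VPERMV3(X, M, Y) == VPERMV(M, CONCAT(X, Y)).
  assert(vpermv3(X, M, Y) == vpermv(M, XY));
  return 0;
}

When X == Y, the concatenated form must not be created before the VPERMV3-with-identical-sources canonicalization has run, which is why the fold is moved below the other VPERMV3 combines in this patch.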

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 94 additions & 30 deletions
@@ -42675,40 +42675,10 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
     return SDValue();
   }
   case X86ISD::VPERMV3: {
-    // Combine VPERMV3 to widened VPERMV if the two source operands can be
-    // freely concatenated.
-    MVT WideVT = VT.getDoubleNumVectorElementsVT();
-    bool CanConcat = VT.is128BitVector() ||
-                     (VT.is256BitVector() && Subtarget.useAVX512Regs());
-    if (CanConcat) {
-      SDValue Ops[] = {N.getOperand(0), N.getOperand(2)};
-      if (SDValue ConcatSrc =
-              combineConcatVectorOps(DL, WideVT, Ops, DAG, Subtarget)) {
-        SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
-                                      DL, WideVT.getSizeInBits());
-        SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
-        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
-                           DAG.getVectorIdxConstant(0, DL));
-      }
-    }
     SmallVector<SDValue, 2> SrcOps;
     SmallVector<int, 32> Mask;
     if (getTargetShuffleMask(N, /*AllowSentinelZero=*/false, SrcOps, Mask)) {
       assert(Mask.size() == NumElts && "Unexpected shuffle mask size");
-      // See if we can concatenate the commuted operands.
-      if (CanConcat) {
-        if (SDValue ConcatSrc = combineConcatVectorOps(
-                DL, WideVT, {N.getOperand(2), N.getOperand(0)}, DAG,
-                Subtarget)) {
-          ShuffleVectorSDNode::commuteMask(Mask);
-          Mask.append(NumElts, SM_SentinelUndef);
-          SDValue Perm =
-              lowerShuffleWithPERMV(DL, WideVT, Mask, ConcatSrc,
-                                    DAG.getUNDEF(WideVT), Subtarget, DAG);
-          return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
-                             DAG.getVectorIdxConstant(0, DL));
-        }
-      }
       SDValue V1 = peekThroughBitcasts(N.getOperand(0));
       SDValue V2 = peekThroughBitcasts(N.getOperand(2));
       // Canonicalize to VPERMV if both sources are the same.
@@ -42743,6 +42713,37 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
                                      N.getOperand(0), Subtarget, DAG);
       }
     }
+    // Combine VPERMV3 to widened VPERMV if the two source operands can be
+    // freely concatenated.
+    MVT WideVT = VT.getDoubleNumVectorElementsVT();
+    if (VT.is128BitVector() ||
+        (VT.is256BitVector() && Subtarget.useAVX512Regs())) {
+      SDValue Ops[] = {N.getOperand(0), N.getOperand(2)};
+      if (SDValue ConcatSrc =
+              combineConcatVectorOps(DL, WideVT, Ops, DAG, Subtarget)) {
+        SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
+                                      DL, WideVT.getSizeInBits());
+        SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, ConcatSrc);
+        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
+                           DAG.getVectorIdxConstant(0, DL));
+      }
+      // See if we can concatenate the commuted operands (and then cheaply
+      // shuffle them, for constant shuffle masks this should fold away).
+      SDValue SwapOps[] = {N.getOperand(2), N.getOperand(0)};
+      if (SDValue ConcatSrc =
+              combineConcatVectorOps(DL, WideVT, SwapOps, DAG, Subtarget)) {
+        SmallVector<int, 16> SwapMask(WideVT.getVectorNumElements());
+        std::iota(SwapMask.begin(), SwapMask.begin() + NumElts, NumElts);
+        std::iota(SwapMask.begin() + NumElts, SwapMask.end(), 0);
+        SDValue Swap = DAG.getVectorShuffle(WideVT, DL, ConcatSrc,
+                                            DAG.getUNDEF(WideVT), SwapMask);
+        SDValue Mask = widenSubVector(N.getOperand(1), false, Subtarget, DAG,
+                                      DL, WideVT.getSizeInBits());
+        SDValue Perm = DAG.getNode(X86ISD::VPERMV, DL, WideVT, Mask, Swap);
+        return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Perm,
+                           DAG.getVectorIdxConstant(0, DL));
+      }
+    }
     return SDValue();
   }
   default:
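
The SwapMask built with std::iota in the added block above is just the half-swap permutation [NumElts..2*NumElts-1, 0..NumElts-1], which turns CONCAT(Y, X) back into CONCAT(X, Y). A standalone sketch with an illustrative width (assuming VT = v4i32, so WideVT = v8i32):

#include <cassert>
#include <numeric>
#include <vector>

int main() {
  unsigned NumElts = 4; // elements per source vector (illustrative)
  std::vector<int> SwapMask(2 * NumElts);
  // First half selects the upper source half, second half the lower.
  std::iota(SwapMask.begin(), SwapMask.begin() + NumElts, NumElts);
  std::iota(SwapMask.begin() + NumElts, SwapMask.end(), 0);
  assert((SwapMask == std::vector<int>{4, 5, 6, 7, 0, 1, 2, 3}));
  return 0;
}

For constant shuffle masks this swap folds away entirely; for non-constant masks it survives as an extra lane shuffle (e.g. VSHUFF64X2), which is the trade-off called out in the commit message.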
@@ -43814,6 +43815,69 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
     }
     break;
   }
+  case X86ISD::VPERMV: {
+    SmallVector<int, 16> Mask;
+    SmallVector<SDValue, 2> Ops;
+    if ((VT.is256BitVector() || Subtarget.hasVLX()) &&
+        getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
+      // For lane-crossing shuffles, only split in half in case we're still
+      // referencing higher elements.
+      unsigned HalfElts = NumElts / 2;
+      unsigned HalfSize = SizeInBits / 2;
+      Mask.resize(HalfElts);
+      if (all_of(Mask,
+                 [&](int M) { return isUndefOrInRange(M, 0, HalfElts); })) {
+        MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
+        SDLoc DL(Op);
+        SDValue Ext;
+        SDValue M =
+            extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize);
+        SDValue V =
+            extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, HalfSize);
+        // For 128-bit v2X64/v4X32 instructions, use VPERMILPD/VPERMILPS.
+        if (VT.is512BitVector() || VT.getScalarSizeInBits() <= 16)
+          Ext = TLO.DAG.getNode(Opc, DL, HalfVT, M, V);
+        else
+          Ext = TLO.DAG.getNode(X86ISD::VPERMILPV, DL, HalfVT, V, M);
+        SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
+                                        Subtarget, TLO.DAG, DL, SizeInBits);
+        return TLO.CombineTo(Op, Insert);
+      }
+    }
+    break;
+  }
+  case X86ISD::VPERMV3: {
+    SmallVector<int, 16> Mask;
+    SmallVector<SDValue, 2> Ops;
+    if (Subtarget.hasVLX() &&
+        getTargetShuffleMask(Op, /*AllowSentinelZero=*/false, Ops, Mask)) {
+      // For lane-crossing shuffles, only split in half in case we're still
+      // referencing higher elements.
+      unsigned HalfElts = NumElts / 2;
+      unsigned HalfSize = SizeInBits / 2;
+      Mask.resize(HalfElts);
+      if (all_of(Mask, [&](int M) {
+            return isUndefOrInRange(M, 0, HalfElts) ||
+                   isUndefOrInRange(M, NumElts, NumElts + HalfElts);
+          })) {
+        // Adjust mask elements for 2nd operand to point to half width.
+        for (int &M : Mask)
+          M = (M < (int)NumElts) ? M : (M - HalfElts);
+        MVT HalfVT = VT.getSimpleVT().getHalfNumVectorElementsVT();
+        MVT HalfIntVT = HalfVT.changeVectorElementTypeToInteger();
+        SDLoc DL(Op);
+        SDValue Ext = TLO.DAG.getNode(
+            Opc, DL, HalfVT,
+            extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, HalfSize),
+            getConstVector(Mask, HalfIntVT, TLO.DAG, DL, /*IsMask=*/true),
+            extractSubVector(Op.getOperand(2), 0, TLO.DAG, DL, HalfSize));
+        SDValue Insert = widenSubVector(Ext, /*ZeroNewElements=*/false,
+                                        Subtarget, TLO.DAG, DL, SizeInBits);
+        return TLO.CombineTo(Op, Insert);
+      }
+    }
+    break;
+  }
   case X86ISD::VPERM2X128: {
     // Simplify VPERM2F128/VPERM2I128 to extract_subvector.
     SDLoc DL(Op);
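
The VPERMV3 half-split above relies on a simple index remap: once the demanded low half provably only references the low HalfElts elements of either operand, indices into the second operand ([NumElts, NumElts + HalfElts)) collapse to [HalfElts, 2 * HalfElts) in the half-width node. A standalone sketch with illustrative values, mirroring the vec384_i32 test change below (a v16i32 mask [0,21,22,23] becoming a v8i32 mask [0,13,14,15]):

#include <cassert>
#include <vector>

int main() {
  const int NumElts = 16, HalfElts = 8; // v16i32 -> v8i32 split (illustrative)
  std::vector<int> Mask = {0, 21, 22, 23}; // only low elements demanded
  // Remap 2nd-operand indices into the half-width index space.
  for (int &M : Mask)
    M = (M < NumElts) ? M : (M - HalfElts);
  assert((Mask == std::vector<int>{0, 13, 14, 15}));
  return 0;
}

Note the comparison must be strict: an index equal to NumElts (the first element of the second operand) has to be remapped to HalfElts, not left in place.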

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll

Lines changed: 23 additions & 23 deletions
@@ -749,10 +749,10 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ;
 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
 ; AVX512BW-SLOW:       # %bb.0:
-; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm0
+; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
+; AVX512BW-SLOW-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
+; AVX512BW-SLOW-NEXT:    vpermw %ymm1, %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-SLOW-NEXT:    vzeroupper
@@ -763,7 +763,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,0,11,0,13,6,7]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm1
 ; AVX512BW-FAST-NEXT:    vmovd %xmm0, %eax
 ; AVX512BW-FAST-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
 ; AVX512BW-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
@@ -870,7 +870,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,13,6,7]
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-SLOW-NEXT:    vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-SLOW-NEXT:    vpermw %ymm0, %ymm1, %ymm1
 ; AVX512BW-SLOW-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3]
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
@@ -883,7 +883,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-FAST-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,9,10,11,0,5,6,7]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-FAST-NEXT:    vpermw %zmm0, %zmm1, %zmm1
+; AVX512BW-FAST-NEXT:    vpermw %ymm0, %ymm1, %ymm1
 ; AVX512BW-FAST-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5,6,7]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
@@ -1000,10 +1000,10 @@ define void @vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2(ptr %in.
 ;
 ; AVX512BW-LABEL: vec128_i32_widen_to_i64_factor2_broadcast_to_v2i64_factor2:
 ; AVX512BW:       # %bb.0:
-; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
-; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,5,0,7]
-; AVX512BW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
-; AVX512BW-NEXT:    vpermd %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm0 = [0,5,0,7]
+; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm1
+; AVX512BW-NEXT:    vpaddb (%rsi), %ymm1, %ymm1
+; AVX512BW-NEXT:    vpermd %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpaddb (%rdx), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqa64 %zmm0, (%rcx)
 ; AVX512BW-NEXT:    vzeroupper
@@ -4610,10 +4610,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
-; AVX512F-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15]
+; AVX512F-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -4623,10 +4623,10 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,21,22,23]
-; AVX512DQ-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [0,13,14,15]
+; AVX512DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper
@@ -4868,10 +4868,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,11]
-; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512F-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
 ; AVX512F-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512F-NEXT:    vzeroupper
@@ -4881,10 +4881,10 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i
 ; AVX512DQ:       # %bb.0:
 ; AVX512DQ-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512DQ-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm1, %ymm1
-; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,11]
-; AVX512DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [0,7]
+; AVX512DQ-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
 ; AVX512DQ-NEXT:    vpaddb (%rdx), %ymm2, %ymm0
 ; AVX512DQ-NEXT:    vmovdqa %ymm0, (%rcx)
 ; AVX512DQ-NEXT:    vzeroupper

llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll

Lines changed: 4 additions & 4 deletions
@@ -644,7 +644,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
 ; AVX512BW-SLOW:       # %bb.0:
 ; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,0,15]
-; AVX512BW-SLOW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vpermw (%rdi), %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rdx)
 ; AVX512BW-SLOW-NEXT:    vzeroupper
@@ -653,7 +653,7 @@ define void @vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4(ptr %in.
 ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i32_factor2_broadcast_to_v4i32_factor4:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,9,0,11,0,13,6,7]
-; AVX512BW-FAST-NEXT:    vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpermw (%rdi), %ymm0, %ymm0
 ; AVX512BW-FAST-NEXT:    vpinsrw $6, (%rdi), %xmm0, %xmm0
 ; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],mem[7]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
@@ -738,7 +738,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-SLOW-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
 ; AVX512BW-SLOW:       # %bb.0:
 ; AVX512BW-SLOW-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,13,6,7]
-; AVX512BW-SLOW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-SLOW-NEXT:    vpermw (%rdi), %ymm0, %ymm0
 ; AVX512BW-SLOW-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],mem[3]
 ; AVX512BW-SLOW-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-SLOW-NEXT:    vmovdqa64 %zmm0, (%rdx)
@@ -748,7 +748,7 @@ define void @vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2(ptr %in.
 ; AVX512BW-FAST-LABEL: vec128_i16_widen_to_i64_factor4_broadcast_to_v2i64_factor2:
 ; AVX512BW-FAST:       # %bb.0:
 ; AVX512BW-FAST-NEXT:    vpmovsxbw {{.*#+}} xmm0 = [0,9,10,11,0,5,6,7]
-; AVX512BW-FAST-NEXT:    vpermw (%rdi), %zmm0, %zmm0
+; AVX512BW-FAST-NEXT:    vpermw (%rdi), %ymm0, %ymm0
 ; AVX512BW-FAST-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],mem[5,6,7]
 ; AVX512BW-FAST-NEXT:    vpaddb (%rsi), %zmm0, %zmm0
 ; AVX512BW-FAST-NEXT:    vmovdqa64 %zmm0, (%rdx)

llvm/test/CodeGen/X86/shuffle-vs-trunc-128.ll

Lines changed: 6 additions & 6 deletions
@@ -1113,8 +1113,8 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
 ;
 ; AVX512VBMI-FAST-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
 ; AVX512VBMI-FAST:       # %bb.0:
-; AVX512VBMI-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,79]
-; AVX512VBMI-FAST-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-FAST-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,65,66,67,68,69,24,28,32,36,40,44,48,52,56,79]
+; AVX512VBMI-FAST-NEXT:    vpmovdb %ymm0, %xmm2
 ; AVX512VBMI-FAST-NEXT:    vpermi2b %zmm2, %zmm0, %zmm1
 ; AVX512VBMI-FAST-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
 ; AVX512VBMI-FAST-NEXT:    vpextrw $6, %xmm0, %eax
@@ -1124,14 +1124,14 @@ define <16 x i8> @evenelts_v32i16_trunc_v16i16_to_v16i8(<32 x i16> %n2) nounwind
 ;
 ; AVX512VBMI-SLOW-LABEL: evenelts_v32i16_trunc_v16i16_to_v16i8:
 ; AVX512VBMI-SLOW:       # %bb.0:
-; AVX512VBMI-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,4,8,12,16,20,24,28,32,36,40,44,48,77,78,79]
-; AVX512VBMI-SLOW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512VBMI-SLOW-NEXT:    vpermi2b %zmm2, %zmm0, %zmm1
+; AVX512VBMI-SLOW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,92,96,100,104,108,112,13,14,15]
+; AVX512VBMI-SLOW-NEXT:    vpmovdb %ymm0, %xmm2
+; AVX512VBMI-SLOW-NEXT:    vpermt2b %zmm0, %zmm1, %zmm2
 ; AVX512VBMI-SLOW-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
 ; AVX512VBMI-SLOW-NEXT:    vpextrw $6, %xmm0, %eax
 ; AVX512VBMI-SLOW-NEXT:    vpextrw $4, %xmm0, %ecx
 ; AVX512VBMI-SLOW-NEXT:    vpextrw $2, %xmm0, %edx
-; AVX512VBMI-SLOW-NEXT:    vpinsrb $13, %edx, %xmm1, %xmm0
+; AVX512VBMI-SLOW-NEXT:    vpinsrb $13, %edx, %xmm2, %xmm0
 ; AVX512VBMI-SLOW-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; AVX512VBMI-SLOW-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
 ; AVX512VBMI-SLOW-NEXT:    vzeroupper
