@@ -35796,19 +35796,6 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
                      (RootVT.isFloatingPoint() && Depth >= 1) ||
                      (RootVT.is256BitVector() && !Subtarget.hasAVX2());
 
-  // How many elements does each of the inputs have, given the current
-  // granularity of the root shuffle? Note that while currently the sizes of an
-  // inputs must match the size of the shuffle root,
-  // that restriction will be lifted in the future.
-  SmallVector<unsigned, 2> InputNumElts;
-  llvm::transform(std::initializer_list<MVT>({VT1, VT2}),
-                  std::back_inserter(InputNumElts),
-                  [BaseMaskEltSizeInBits](MVT VT) {
-                    assert(VT.getSizeInBits() % BaseMaskEltSizeInBits == 0 &&
-                           "Input is not a multiple of output element width?");
-                    return VT.getSizeInBits() / BaseMaskEltSizeInBits;
-                  });
-
   // Don't combine if we are a AVX512/EVEX target and the mask element size
   // is different from the root element size - this would prevent writemasks
   // from being reused.
@@ -35823,44 +35810,19 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   // If we are shuffling a broadcast (and not introducing zeros) then
   // we can just use the broadcast directly. This works for smaller broadcast
   // elements as well as they already repeat across each mask element
-  SmallVector<bool, 2> InputIsSplat;
-  llvm::transform(
-      std::initializer_list<SDValue>({V1, V2}),
-      std::back_inserter(InputIsSplat), [BaseMaskEltSizeInBits](SDValue V) {
-        return isTargetShuffleSplat(V) &&
-               (BaseMaskEltSizeInBits % V.getScalarValueSizeInBits()) == 0;
-      });
-  if (UnaryShuffle && InputIsSplat[0] && !isAnyZero(BaseMask) &&
+  if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
+      (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 &&
       V1.getValueSizeInBits() >= RootSizeInBits) {
     return CanonicalizeShuffleInput(RootVT, V1);
   }
 
-  SmallVector<int, 64> Mask(BaseMask.begin(), BaseMask.end());
-
-  // Adjust mask elements that pick from a splat input to be identity mask elts,
-  // i.e. to pick from the same lane of the input as the mask element is in.
-  // This may allow to simplify the shuffle into a blend.
-  if (InputIsSplat[0] || InputIsSplat[1]) {
-    for (unsigned i = 0; i != NumBaseMaskElts; ++i) {
-      int &M = Mask[i];
-      assert(isUndefOrZeroOrInRange(M, 0, 2 * NumBaseMaskElts) &&
-             "OOB mask element?");
-      if (M < 0)
-        continue; // Keep the undef/zero mask elements as-is.
-      int InputIdx = (unsigned)M < NumBaseMaskElts ? 0 : 1;
-      // Is the used input wide-enough to contain that lane, and is it a splat?
-      if (InputIsSplat[InputIdx] && i < InputNumElts[InputIdx])
-        M = i + InputIdx * NumBaseMaskElts; // Pick from the same lane of input.
-    }
-  }
-
   // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
   // etc. can be simplified.
   if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
     SmallVector<int> ScaledMask, IdentityMask;
     unsigned NumElts = VT1.getVectorNumElements();
-    if (Mask.size() <= NumElts &&
-        scaleShuffleElements(Mask, NumElts, ScaledMask)) {
+    if (BaseMask.size() <= NumElts &&
+        scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
       for (unsigned i = 0; i != NumElts; ++i)
         IdentityMask.push_back(i);
       if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
@@ -35874,22 +35836,22 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     // If the upper subvectors are zeroable, then an extract+insert is more
     // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
     // to zero the upper subvectors.
-    if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
+    if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
         return SDValue(); // Nothing to do!
-      assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
+      assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
              "Unexpected lane shuffle");
       Res = CanonicalizeShuffleInput(RootVT, V1);
-      unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
-      bool UseZero = isAnyZero(Mask);
+      unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
+      bool UseZero = isAnyZero(BaseMask);
       Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
       return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
     }
 
     // Narrow shuffle mask to v4x128.
     SmallVector<int, 4> ScaledMask;
     assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
-    narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);
+    narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, ScaledMask);
 
     // Try to lower to vshuf64x2/vshuf32x4.
     auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
@@ -35948,20 +35910,20 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     // If the upper half is zeroable, then an extract+insert is more optimal
     // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
     // zero the upper half.
-    if (isUndefOrZero(Mask[1])) {
+    if (isUndefOrZero(BaseMask[1])) {
       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
         return SDValue(); // Nothing to do!
-      assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
+      assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
       Res = CanonicalizeShuffleInput(RootVT, V1);
-      Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
-      return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
-                            256);
+      Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
+      return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
+                            DL, 256);
     }
 
     // If we're splatting the low subvector, an insert-subvector 'concat'
     // pattern is quicker than VPERM2X128.
     // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
-    if (Mask[0] == 0 && Mask[1] == 0 && !Subtarget.hasAVX2()) {
+    if (BaseMask[0] == 0 && BaseMask[1] == 0 && !Subtarget.hasAVX2()) {
       if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
         return SDValue(); // Nothing to do!
       Res = CanonicalizeShuffleInput(RootVT, V1);
@@ -35976,11 +35938,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     // we need to use the zeroing feature.
     // Prefer blends for sequential shuffles unless we are optimizing for size.
     if (UnaryShuffle &&
-        !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
-        (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
+        !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
+        (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
       unsigned PermMask = 0;
-      PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
-      PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
+      PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+      PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
       return DAG.getNode(
           X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
           DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
@@ -35991,15 +35953,16 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
 
     // TODO - handle AVX512VL cases with X86ISD::SHUF128.
     if (!UnaryShuffle && !IsMaskedShuffle) {
-      assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
+      assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
              "Unexpected shuffle sentinel value");
       // Prefer blends to X86ISD::VPERM2X128.
-      if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
+      if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
+            (BaseMask[0] == 2 && BaseMask[1] == 1))) {
         unsigned PermMask = 0;
-        PermMask |= ((Mask[0] & 3) << 0);
-        PermMask |= ((Mask[1] & 3) << 4);
-        SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
-        SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
+        PermMask |= ((BaseMask[0] & 3) << 0);
+        PermMask |= ((BaseMask[1] & 3) << 4);
+        SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
+        SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
         return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
                            CanonicalizeShuffleInput(RootVT, LHS),
                            CanonicalizeShuffleInput(RootVT, RHS),
@@ -36010,12 +35973,13 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
 
   // For masks that have been widened to 128-bit elements or more,
   // narrow back down to 64-bit elements.
+  SmallVector<int, 64> Mask;
   if (BaseMaskEltSizeInBits > 64) {
     assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
     int MaskScale = BaseMaskEltSizeInBits / 64;
-    SmallVector<int, 64> ScaledMask;
-    narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
-    Mask = std::move(ScaledMask);
+    narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
+  } else {
+    Mask.assign(BaseMask.begin(), BaseMask.end());
   }
 
   // For masked shuffles, we're trying to match the root width for better
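Taken together, these hunks drop the InputNumElts/InputIsSplat bookkeeping and the splat-canonicalization pass over the mask, so BaseMask is used directly throughout; a local Mask is only materialized in the last hunk, either by narrowing a widened mask back to 64-bit granularity or by copying BaseMask as-is. The snippet below is a minimal standalone sketch of that narrowing step, a simplified model of what narrowShuffleMaskElts does rather than the LLVM implementation, and the example mask values are made up:

// Standalone sketch (not LLVM code): each wide mask element expands into
// `Scale` consecutive narrow-element indices; negative sentinel elements
// (undef/zero markers) are simply repeated.
#include <cstdio>
#include <vector>

static void narrowMaskElts(int Scale, const std::vector<int> &Mask,
                           std::vector<int> &Narrowed) {
  Narrowed.clear();
  for (int M : Mask)
    for (int i = 0; i != Scale; ++i)
      Narrowed.push_back(M < 0 ? M : M * Scale + i);
}

int main() {
  // Hypothetical mask <1, 0> over 256-bit elements; narrowing by 256/64 = 4
  // yields the equivalent 64-bit-element mask <4, 5, 6, 7, 0, 1, 2, 3>.
  std::vector<int> BaseMask = {1, 0}, Mask;
  narrowMaskElts(/*Scale=*/4, BaseMask, Mask);
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}

Declaring Mask right next to this narrowing keeps BaseMask untouched for the earlier lane-shuffle checks, which is the effect of the final hunk above.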