@@ -35836,14 +35836,14 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
    return CanonicalizeShuffleInput(RootVT, V1);
  }

+  SmallVector<int, 64> Mask(BaseMask.begin(), BaseMask.end());
+
  // Adjust mask elements that pick from a splat input to be identity mask elts,
  // i.e. to pick from the same lane of the input as the mask element is in.
  // This may allow to simplify the shuffle into a blend.
-  SmallVector<int> NewMask;
  if (InputIsSplat[0] || InputIsSplat[1]) {
-    NewMask.assign(BaseMask.begin(), BaseMask.end());
    for (unsigned i = 0; i != NumBaseMaskElts; ++i) {
-      int &M = NewMask[i];
+      int &M = Mask[i];
      assert(isUndefOrZeroOrInRange(M, 0, 2 * NumBaseMaskElts) &&
             "OOB mask element?");
      if (M < 0)
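
Note on the hunk above: mask elements that read from a splat input are rewritten as identity picks, since every lane of a splat holds the same value. A minimal standalone sketch of that canonicalization, using plain std::vector in place of the LLVM types (the helper name is illustrative only):

#include <vector>

// Rewrite mask elts that read from a splat input as identity picks.
// Mask elts in [0, NumElts) select input 0, [NumElts, 2*NumElts) input 1;
// negative values are undef/zero sentinels and are left untouched.
void canonicalizeSplatMask(std::vector<int> &Mask, const bool IsSplat[2],
                           unsigned NumElts) {
  for (unsigned I = 0; I != Mask.size(); ++I) {
    int M = Mask[I];
    if (M < 0)
      continue; // Keep undef/zero sentinels as-is.
    unsigned InputIdx = (unsigned)M >= NumElts ? 1 : 0;
    if (IsSplat[InputIdx] && I < NumElts)
      Mask[I] = I + InputIdx * NumElts; // Same lane of the same input.
  }
}

Rewriting to identity elements makes a later blend match possible, because each element now reads from the lane it produces.
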
@@ -35853,16 +35853,15 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
      if (InputIsSplat[InputIdx] && i < InputNumElts[InputIdx])
        M = i + InputIdx * NumBaseMaskElts; // Pick from the same lane of input.
    }
-    BaseMask = std::move(NewMask);
  }

  // See if the shuffle is a hidden identity shuffle - repeated args in HOPs
  // etc. can be simplified.
  if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits) {
    SmallVector<int> ScaledMask, IdentityMask;
    unsigned NumElts = VT1.getVectorNumElements();
-    if (BaseMask.size() <= NumElts &&
-        scaleShuffleElements(BaseMask, NumElts, ScaledMask)) {
+    if (Mask.size() <= NumElts &&
+        scaleShuffleElements(Mask, NumElts, ScaledMask)) {
      for (unsigned i = 0; i != NumElts; ++i)
        IdentityMask.push_back(i);
      if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, V1, V2))
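
The identity check above scales the mask to the common element count and compares it against [0, 1, ..., NumElts-1]. A sketch of just the comparison step (the real code goes through isTargetShuffleEquivalent, which also tolerates undef elements):

#include <vector>

// True if ScaledMask is the identity permutation [0, 1, ..., N-1].
bool isIdentityMask(const std::vector<int> &ScaledMask) {
  for (unsigned I = 0; I != ScaledMask.size(); ++I)
    if (ScaledMask[I] != (int)I)
      return false;
  return true;
}

If the shuffle is an identity over repeated arguments (e.g. a HOP whose operands are equal), the whole chain folds to one of its inputs.
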
@@ -35876,22 +35875,22 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
    // If the upper subvectors are zeroable, then an extract+insert is more
    // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
    // to zero the upper subvectors.
-    if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
+    if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) {
      if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
        return SDValue(); // Nothing to do!
-      assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
+      assert(isInRange(Mask[0], 0, NumBaseMaskElts) &&
             "Unexpected lane shuffle");
      Res = CanonicalizeShuffleInput(RootVT, V1);
-      unsigned SubIdx = BaseMask[0] * (NumRootElts / NumBaseMaskElts);
-      bool UseZero = isAnyZero(BaseMask);
+      unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts);
+      bool UseZero = isAnyZero(Mask);
      Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
      return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
    }

    // Narrow shuffle mask to v4x128.
    SmallVector<int, 4> ScaledMask;
    assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
-    narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, ScaledMask);
+    narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask);

    // Try to lower to vshuf64x2/vshuf32x4.
    auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL,
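
narrowShuffleMaskElts above rescales the widened mask to 128-bit granularity: every wide element expands into Scale consecutive narrow elements. A standalone sketch of that expansion (sentinels replicate unchanged), assuming the same semantics as the LLVM helper:

#include <vector>

// Expand each wide mask elt M into Scale narrow elts M*Scale .. M*Scale+Scale-1.
void narrowMaskElts(int Scale, const std::vector<int> &Mask,
                    std::vector<int> &Scaled) {
  Scaled.clear();
  for (int M : Mask)
    for (int I = 0; I != Scale; ++I)
      Scaled.push_back(M < 0 ? M : M * Scale + I);
}

For example, a v2x256 mask {1, -1} narrowed to 128-bit elements (Scale = 2) becomes {2, 3, -1, -1}.
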
@@ -35950,20 +35949,20 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
    // If the upper half is zeroable, then an extract+insert is more optimal
    // than using X86ISD::VPERM2X128. The insertion is free, even if it has to
    // zero the upper half.
-    if (isUndefOrZero(BaseMask[1])) {
+    if (isUndefOrZero(Mask[1])) {
      if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
        return SDValue(); // Nothing to do!
-      assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
+      assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle");
      Res = CanonicalizeShuffleInput(RootVT, V1);
-      Res = extract128BitVector(Res, BaseMask[0] * (NumRootElts / 2), DAG, DL);
-      return widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
-                            DL, 256);
+      Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL);
+      return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL,
+                            256);
    }

    // If we're splatting the low subvector, an insert-subvector 'concat'
    // pattern is quicker than VPERM2X128.
    // TODO: Add AVX2 support instead of VPERMQ/VPERMPD.
-    if (BaseMask[0] == 0 && BaseMask[1] == 0 && !Subtarget.hasAVX2()) {
+    if (Mask[0] == 0 && Mask[1] == 0 && !Subtarget.hasAVX2()) {
      if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
        return SDValue(); // Nothing to do!
      Res = CanonicalizeShuffleInput(RootVT, V1);
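
Both rewrites above trade the cross-lane VPERM2X128 for cheaper subvector ops: when the upper half is undef/zero, only one 128-bit half of V1 is live, so an extract plus a (free) widen/zero-insert suffices. A sketch of the index arithmetic, with hypothetical names standing in for the LLVM helpers:

// For a 2 x 128-bit lane mask {M0, M1} with M1 undef/zero, the result is
// lane M0 of V1, widened back to 256 bits (zeroing the top if M1 was zero).
struct HalfExtract {
  unsigned EltIdx; // First root element of the chosen 128-bit half.
  bool ZeroUpper;  // True if the widened upper half must be zeroed.
};

HalfExtract planHalfExtract(int M0, int M1, unsigned NumRootElts,
                            int SentinelZero) {
  return {(unsigned)M0 * (NumRootElts / 2), M1 == SentinelZero};
}

The splat-low case (mask {0, 0}) similarly prefers an insert-subvector 'concat' of the low half over VPERM2X128 on pre-AVX2 targets.
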
@@ -35978,11 +35977,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
    // we need to use the zeroing feature.
    // Prefer blends for sequential shuffles unless we are optimizing for size.
    if (UnaryShuffle &&
-        !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
-        (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
+        !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) &&
+        (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) {
      unsigned PermMask = 0;
-      PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
-      PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
+      PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0);
+      PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4);
      return DAG.getNode(
          X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1),
          DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8));
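
The PermMask built above is the VPERM2X128/VPERM2F128 imm8: bits [1:0] pick the source 128-bit lane for the low result half, bits [5:4] for the high half, and setting bit 3 of either nibble (0x08 / 0x80) zeroes that half instead. A sketch of the unary encoding, mirroring the two lines above:

// Encode the imm8 for a unary VPERM2X128: negative (undef/zero) mask elts
// request a zeroed half via the nibble's bit 3.
unsigned encodeUnaryPerm2X128(int M0, int M1) {
  unsigned Imm = 0;
  Imm |= (M0 < 0 ? 0x8 : (M0 & 1)) << 0; // Low 128-bit half of the result.
  Imm |= (M1 < 0 ? 0x8 : (M1 & 1)) << 4; // High 128-bit half of the result.
  return Imm;
}

For example, mask {1, zero} encodes as 0x81: low half from V1's high lane, upper half zeroed.
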
@@ -35993,16 +35992,15 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,

    // TODO - handle AVX512VL cases with X86ISD::SHUF128.
    if (!UnaryShuffle && !IsMaskedShuffle) {
-      assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
+      assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) &&
             "Unexpected shuffle sentinel value");
      // Prefer blends to X86ISD::VPERM2X128.
-      if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
-            (BaseMask[0] == 2 && BaseMask[1] == 1))) {
+      if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) {
        unsigned PermMask = 0;
-        PermMask |= ((BaseMask[0] & 3) << 0);
-        PermMask |= ((BaseMask[1] & 3) << 4);
-        SDValue LHS = isInRange(BaseMask[0], 0, 2) ? V1 : V2;
-        SDValue RHS = isInRange(BaseMask[1], 0, 2) ? V1 : V2;
+        PermMask |= ((Mask[0] & 3) << 0);
+        PermMask |= ((Mask[1] & 3) << 4);
+        SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2;
+        SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2;
        return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT,
                           CanonicalizeShuffleInput(RootVT, LHS),
                           CanonicalizeShuffleInput(RootVT, RHS),
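
In the two-input case the mask elements are in [0, 4): 0/1 select the halves of V1 and 2/3 the halves of V2, and each nibble of the imm8 keeps the low two bits of its mask element. A sketch of just the immediate encoding (operand selection via LHS/RHS is shown in the diff itself):

// Binary VPERM2X128 imm8: low nibble = Mask[0] & 3, high nibble = Mask[1] & 3.
unsigned encodeBinaryPerm2X128(int M0, int M1) {
  return ((M0 & 3) << 0) | ((M1 & 3) << 4);
}

The excluded patterns {0, 3} and {2, 1} keep each 128-bit half in place, so a blend is cheaper there than the lane-crossing VPERM2X128.
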
@@ -36013,13 +36011,12 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,

  // For masks that have been widened to 128-bit elements or more,
  // narrow back down to 64-bit elements.
-  SmallVector<int, 64> Mask;
  if (BaseMaskEltSizeInBits > 64) {
    assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
    int MaskScale = BaseMaskEltSizeInBits / 64;
-    narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
-  } else {
-    Mask.assign(BaseMask.begin(), BaseMask.end());
+    SmallVector<int, 64> ScaledMask;
+    narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
+    Mask = std::move(ScaledMask);
  }

  // For masked shuffles, we're trying to match the root width for better
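
A short standalone check of the 128-bit to 64-bit narrowing in this final hunk (illustrative, reusing the expansion rule sketched earlier): with BaseMaskEltSizeInBits = 128 the scale is 2, so the v2x128 mask {1, -1} becomes the v4x64 mask {2, 3, -1, -1}.

#include <cstdio>
#include <vector>

int main() {
  std::vector<int> Wide = {1, -1}, Narrow;
  int MaskScale = 128 / 64; // BaseMaskEltSizeInBits / 64.
  for (int M : Wide)
    for (int I = 0; I != MaskScale; ++I)
      Narrow.push_back(M < 0 ? M : M * MaskScale + I);
  for (int M : Narrow)
    std::printf("%d ", M); // Prints: 2 3 -1 -1
  return 0;
}

Since Mask is now declared up front and rewritten in place, this hunk drops the else-branch copy from BaseMask, matching the removal of the NewMask temporary in the first hunk.
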