@@ -14322,13 +14322,15 @@ static SDValue lowerShuffleAsSpecificZeroOrAnyExtend(
return SDValue();
MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
NumElements / Scale);
+ InputV = DAG.getBitcast(VT, InputV);
InputV = ShuffleOffset(InputV);
InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND,
DL, ExtVT, InputV, DAG);
return DAG.getBitcast(VT, InputV);
}

assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
+ InputV = DAG.getBitcast(VT, InputV);

// For any extends we can cheat for larger element sizes and use shuffle
// instructions that can fold with a load and/or copy.
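The two added `DAG.getBitcast(VT, InputV)` calls normalize the input to the shuffle's value type before the extend nodes are built, which appears to be what lets the new f32 call sites later in this patch hand float operands straight to this integer-extend path. For orientation, the pattern this helper matches is, roughly, a mask that reads consecutive low elements of one input at a stride of `Scale`, with every gap filled by an undef or zeroable lane. A minimal standalone model of that mask shape (the name `looksLikeZExtByScale` is mine, not an LLVM API):

```cpp
#include <cstdio>
#include <vector>

// Rough model: a shuffle of NumElts lanes behaves like a zero-extend by
// `Scale` if lane i*Scale reads input element i and every other lane may be
// zero (-1 marks a lane that is undef or known to be zeroable).
static bool looksLikeZExtByScale(const std::vector<int> &Mask, int Scale) {
  int NumElts = (int)Mask.size();
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    if (i % Scale == 0) {
      if (M != i / Scale)
        return false;        // must read the next low input element
    } else if (M != -1) {
      return false;          // padding lanes must be zero/undef
    }
  }
  return true;
}

int main() {
  // v4i32 mask <0, zero, 1, zero>: a zero-extend of the low two elements,
  // i.e. element width EltBits * Scale with Scale == 2.
  std::vector<int> Mask = {0, -1, 1, -1};
  std::printf("%s\n", looksLikeZExtByScale(Mask, 2) ? "zext" : "no");
}
```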
@@ -15488,6 +15490,13 @@ static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}

+ if (Subtarget.hasSSE2())
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+ DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) {
+ ZExt = DAG.getBitcast(MVT::v4f32, ZExt);
+ return ZExt;
+ }
+
if (Subtarget.hasAVX2())
if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG))
return Extract;
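The new SSE2-only block lowers a v4f32 shuffle with zeroable lanes by treating the operands as v4i32, reusing `lowerShuffleAsZeroOrAnyExtend`, and bitcasting the result back to v4f32. The bitcast round trip is sound because interleaving float lanes with zeros yields exactly the bytes of a 32-to-64-bit zero-extend of the bitcast integer lanes. A small self-contained check of that equivalence (assumes little-endian layout, as on x86):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  float Src[2] = {1.5f, -2.0f};

  // Shuffle mask <0, zero, 1, zero> applied to the float lanes.
  float Shuffled[4] = {Src[0], 0.0f, Src[1], 0.0f};

  // The same bytes, produced by zero-extending the bitcast i32 lanes to i64.
  uint32_t SrcBits[2];
  std::memcpy(SrcBits, Src, sizeof(SrcBits));
  uint64_t ZExt[2] = {SrcBits[0], SrcBits[1]};

  std::printf("%d\n", std::memcmp(Shuffled, ZExt, sizeof(ZExt)) == 0); // 1
}
```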
@@ -16872,7 +16881,7 @@ static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// AVX vector shuffle types.
static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG, bool SimpleOnly) {
assert(VT.getSizeInBits() >= 256 &&
"Only for 256-bit or wider vector shuffles!");
assert(V1.getSimpleValueType() == VT && "Bad operand type!");
@@ -16900,34 +16909,60 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
std::tie(LoV2, HiV2) = SplitVector(V2);

// Now create two 4-way blends of these half-width vectors.
- auto HalfBlend = [&](ArrayRef<int> HalfMask) {
- bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
- SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
- SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
- SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
+ auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1,
+ bool &UseHiV1, bool &UseLoV2,
+ bool &UseHiV2) {
+ UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false;
for (int i = 0; i < SplitNumElements; ++i) {
int M = HalfMask[i];
if (M >= NumElements) {
if (M >= NumElements + SplitNumElements)
UseHiV2 = true;
else
UseLoV2 = true;
- V2BlendMask[i] = M - NumElements;
- BlendMask[i] = SplitNumElements + i;
} else if (M >= 0) {
if (M >= SplitNumElements)
UseHiV1 = true;
else
UseLoV1 = true;
+ }
+ }
+ };
+
+ auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool {
+ if (!SimpleOnly)
+ return true;
+
+ bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
+ GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
+
+ return !(UseHiV1 || UseHiV2);
+ };
+
+ auto HalfBlend = [&](ArrayRef<int> HalfMask) {
+ SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
+ SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
+ for (int i = 0; i < SplitNumElements; ++i) {
+ int M = HalfMask[i];
+ if (M >= NumElements) {
+ V2BlendMask[i] = M - NumElements;
+ BlendMask[i] = SplitNumElements + i;
+ } else if (M >= 0) {
V1BlendMask[i] = M;
BlendMask[i] = i;
}
}

+ bool UseLoV1, UseHiV1, UseLoV2, UseHiV2;
+ GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2);
+
// Because the lowering happens after all combining takes place, we need to
// manually combine these blend masks as much as possible so that we create
// a minimal number of high-level vector shuffle nodes.

+ assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle won't be simple");
+
// First try just blending the halves of V1 or V2.
if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
return DAG.getUNDEF(SplitVT);
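One way to read the new `SimpleOnly` machinery: `GetHalfBlendPiecesReq` records which half of each source operand a half-width mask touches, and `CheckHalfBlendUsable` refuses the split lowering as soon as a high half would be required. A standalone model of that classification, with names of my own rather than LLVM's:

```cpp
#include <cstdio>
#include <vector>

// HalfMask indexes into the 2*NumElements concatenated source elements
// (V1 first, then V2); SplitNumElements is NumElements / 2.
struct HalfUse { bool LoV1 = false, HiV1 = false, LoV2 = false, HiV2 = false; };

static HalfUse classifyHalf(const std::vector<int> &HalfMask, int NumElements) {
  int SplitNumElements = NumElements / 2;
  HalfUse U;
  for (int M : HalfMask) {
    if (M < 0)
      continue;                                       // undef lane, uses nothing
    if (M >= NumElements)                             // element comes from V2
      (M >= NumElements + SplitNumElements ? U.HiV2 : U.LoV2) = true;
    else                                              // element comes from V1
      (M >= SplitNumElements ? U.HiV1 : U.LoV1) = true;
  }
  return U;
}

// SimpleOnly: each half of the result may only read the low halves of V1/V2.
static bool usableWhenSimpleOnly(const std::vector<int> &HalfMask,
                                 int NumElements) {
  HalfUse U = classifyHalf(HalfMask, NumElements);
  return !U.HiV1 && !U.HiV2;
}

int main() {
  // Low half of a v8 two-input shuffle: reads V1[0..1] and V2[0..1] only.
  std::printf("%d\n", usableWhenSimpleOnly({0, 8, 1, 9}, 8)); // 1
  // This one needs V1's high half (element 5), so it is rejected.
  std::printf("%d\n", usableWhenSimpleOnly({0, 8, 5, 9}, 8)); // 0
}
```

The second mask would force a blend of both halves of V1 before the final per-half shuffle, which is exactly the extra work the `SimpleOnly` callers are trying to avoid.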
@@ -16938,8 +16973,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,

SDValue V1Blend, V2Blend;
if (UseLoV1 && UseHiV1) {
- V1Blend =
- DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
+ V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
} else {
// We only use half of V1 so map the usage down into the final blend mask.
V1Blend = UseLoV1 ? LoV1 : HiV1;
@@ -16948,8 +16982,7 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
}
if (UseLoV2 && UseHiV2) {
- V2Blend =
- DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
+ V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
} else {
// We only use half of V2 so map the usage down into the final blend mask.
V2Blend = UseLoV2 ? LoV2 : HiV2;
@@ -16959,6 +16992,10 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
}
return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
};
+
+ if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask))
+ return SDValue();
+
SDValue Lo = HalfBlend(LoMask);
SDValue Hi = HalfBlend(HiMask);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
@@ -17015,7 +17052,8 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
if (Mask[i] >= 0)
LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+ /*SimpleOnly*/ false);

// Otherwise, just fall back to decomposed shuffles and a blend/unpack. This
// requires that the decomposed single-input shuffles don't end up here.
@@ -17163,6 +17201,20 @@ static SDValue lowerShuffleAsLanePermuteAndPermute(
return getSublanePermute(/*NumSublanes=*/NumLanes * 4);
}

+ /// Helper to compute an in-lane shuffle mask for a complete shuffle mask.
+ static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize,
+ SmallVector<int> &InLaneMask) {
+ int Size = Mask.size();
+ InLaneMask.assign(Mask.begin(), Mask.end());
+ for (int i = 0; i < Size; ++i) {
+ int &M = InLaneMask[i];
+ if (M < 0)
+ continue;
+ if (((M % Size) / LaneSize) != (i / LaneSize))
+ M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
+ }
+ }
+
/// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one
/// source with a lane permutation.
///
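The remapping in `computeInLaneShuffleMask` keeps every element inside its destination lane and redirects any cross-lane read to the second shuffle operand (the `+ Size` term), which callers are expected to fill with a lane-swapped copy of the source. A self-contained model of the same arithmetic, with a worked v8 example:

```cpp
#include <cstdio>
#include <vector>

// Same arithmetic as the helper above: Size mask elements, lanes of LaneSize.
// An element that reads from a different lane is remapped to the same offset
// within its own lane, but on operand 1 (the "+ Size"), which is expected to
// hold a lane-flipped copy of the input.
static std::vector<int> inLaneMask(const std::vector<int> &Mask, int LaneSize) {
  int Size = (int)Mask.size();
  std::vector<int> Out(Mask);
  for (int i = 0; i < Size; ++i) {
    int &M = Out[i];
    if (M < 0)
      continue;
    if (((M % Size) / LaneSize) != (i / LaneSize))
      M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
  }
  return Out;
}

int main() {
  // v8f32 with 128-bit lanes (LaneSize = 4): element 1 reads source element 6,
  // which lives in the other lane, so it becomes (6 % 4) + 0 + 8 = 10.
  for (int M : inLaneMask({0, 6, 2, 3, 4, 1, 6, 7}, /*LaneSize=*/4))
    std::printf("%d ", M); // prints: 0 10 2 3 4 13 6 7
  std::printf("\n");
}
```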
@@ -17207,21 +17259,17 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
assert(V2.isUndef() &&
"This last part of this routine only works on single input shuffles");

- SmallVector<int, 32> InLaneMask(Mask);
- for (int i = 0; i < Size; ++i) {
- int &M = InLaneMask[i];
- if (M < 0)
- continue;
- if (((M % Size) / LaneSize) != (i / LaneSize))
- M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size;
- }
+ SmallVector<int> InLaneMask;
+ computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
+
assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) &&
"In-lane shuffle mask expected");

// If we're not using both lanes in each lane and the inlane mask is not
// repeating, then we're better off splitting.
if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask))
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG,
+ /*SimpleOnly*/ false);

// Flip the lanes, and shuffle the results which should now be in-lane.
MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
@@ -18356,6 +18404,19 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Broadcast;

+ if (!Subtarget.hasAVX2()) {
+ SmallVector<int> InLaneMask;
+ computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask);
+
+ if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask))
+ if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG,
+ /*SimpleOnly*/ true))
+ return R;
+ }
+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return DAG.getBitcast(MVT::v8f32, ZExt);
+
// If the shuffle mask is repeated in each 128-bit lane, we have many more
// options to efficiently lower the shuffle.
SmallVector<int, 4> RepeatedMask;
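In the new pre-AVX2 block, the `SimpleOnly` split is only attempted when even the lane-flipped mask is not repeated in every 128-bit lane, that is, when no cheap per-lane lowering exists; otherwise control falls through to the zero-extend path and the repeated-mask lowerings below. A simplified, single-input sketch of that repeated-lane test (the real `is128BitLaneRepeatedShuffleMask` also handles two-operand masks):

```cpp
#include <cstdio>
#include <vector>

// Simplified single-input model: the shuffle is cheap to do per lane iff
// no element crosses a lane and every lane applies the same in-lane pattern.
static bool isLaneRepeated(const std::vector<int> &Mask, int LaneSize) {
  std::vector<int> Repeated(LaneSize, -1);
  for (int i = 0, Size = (int)Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;                    // undef matches anything
    if (M / LaneSize != i / LaneSize)
      return false;                // crosses a 128-bit lane
    int &R = Repeated[i % LaneSize];
    if (R < 0)
      R = M % LaneSize;
    else if (R != M % LaneSize)
      return false;                // lanes disagree on the pattern
  }
  return true;
}

int main() {
  // v8f32, LaneSize = 4: both lanes use the in-lane pattern <1,0,3,2>.
  std::printf("%d\n", isLaneRepeated({1, 0, 3, 2, 5, 4, 7, 6}, 4)); // 1
  // Lanes use different patterns, so the per-lane lowering does not apply.
  std::printf("%d\n", isLaneRepeated({1, 0, 3, 2, 4, 5, 6, 7}, 4)); // 0
}
```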
@@ -18848,7 +18909,7 @@ static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
return V;
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;
- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
}

MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
@@ -19087,6 +19148,10 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Blend;

+ if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(
+ DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ return DAG.getBitcast(MVT::v16f32, ZExt);
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -19404,7 +19469,7 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (Subtarget.hasVBMI())
return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG);

- return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
}

/// High-level routine to lower various 512-bit x86 vector shuffles.
@@ -19449,7 +19514,7 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
return V;

- return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false);
}

if (VT == MVT::v32f16) {