@@ -29769,24 +29769,27 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
29769
29769
// Given a vector of values, find a permutation such that every adjacent even-
29770
29770
// odd pair has the same value. ~0 is reserved as a special value for wildcard,
29771
29771
// which can be paired with any value. Returns true if a permutation is found.
29772
+ // If output Permutation is not empty, permutation index starts at its previous
29773
+ // size, so that this function can concatenate the result of multiple calls.
29774
+ // UnpairedInputs contains values yet to be paired, mapping an unpaired value to
29775
+ // its current neighbor's value and index.
29776
+ // Do not use llvm::DenseMap as ~0 is a reserved key.
29772
29777
template <typename InputTy,
29773
29778
typename PermutationTy,
29774
- typename MapTy = std::unordered_map <typename InputTy::value_type,
29775
- std::pair<typename InputTy::value_type, typename PermutationTy::value_type>>>
29779
+ typename MapTy = SmallMapVector <typename InputTy::value_type,
29780
+ std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8 >>
29776
29781
static bool PermuteAndPairVector(const InputTy& Inputs,
29777
- PermutationTy &Permutation) {
29782
+ PermutationTy &Permutation,
29783
+ MapTy UnpairedInputs = SmallMapVector<typename InputTy::value_type,
29784
+ std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>()) {
29778
29785
const auto Wildcard = ~typename InputTy::value_type();
29779
-
29780
- // List of values to be paired, mapping an unpaired value to its current
29781
- // neighbor's value and index.
29782
- MapTy UnpairedInputs;
29783
29786
SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
29784
29787
29785
- Permutation.clear ();
29788
+ size_t OutputOffset = Permutation.size ();
29786
29789
typename PermutationTy::value_type I = 0;
29787
29790
for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end(); InputIt != InputEnd;) {
29788
- Permutation.push_back(I);
29789
- Permutation.push_back(I + 1);
29791
+ Permutation.push_back(OutputOffset + I);
29792
+ Permutation.push_back(OutputOffset + I + 1);
29790
29793
29791
29794
auto Even = *InputIt++;
29792
29795
assert(InputIt != InputEnd && "Expected even number of elements");
@@ -29806,7 +29809,7 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
29806
29809
// value's neighbor, otherwise the current value is added to the map.
29807
29810
if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(This, std::make_pair(Other, OtherIndex)); !Inserted) {
29808
29811
auto [SwapValue, SwapIndex] = MapIt->second;
29809
- std::swap(Permutation[SwapIndex], Permutation[ThisIndex]);
29812
+ std::swap(Permutation[OutputOffset + SwapIndex], Permutation[OutputOffset + ThisIndex]);
29810
29813
This = SwapValue;
29811
29814
UnpairedInputs.erase(MapIt);
29812
29815
@@ -29850,13 +29853,12 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
29850
29853
if (Neighbor != Wildcard) {
29851
29854
assert(UnpairedInputs.count(Neighbor));
29852
29855
if (WildcardPairs.size()) {
29853
- std::swap(Permutation[WildcardPairs.back()], Permutation[NeighborIndex]);
29856
+ std::swap(Permutation[OutputOffset + WildcardPairs.back()], Permutation[OutputOffset + NeighborIndex]);
29854
29857
WildcardPairs.pop_back();
29855
29858
// Mark the neighbor as processed.
29856
29859
UnpairedInputs[Neighbor].first = Wildcard;
29857
- } else {
29860
+ } else
29858
29861
return false;
29859
- }
29860
29862
}
29861
29863
}
29862
29864
return true;
@@ -30140,87 +30142,107 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30140
30142
}
30141
30143
}
30142
30144
30143
- // ISD::SRA/SRL/SHL on vXi8 can be widened to vYi16 (Y = X/2) if the constant
30144
- // amounts can be shuffled such that every pair of adjacent elements has the
30145
- // same value. This introduces an extra shuffle before and after the shift,
30146
- // and it is profitable if the operand is aready a shuffle so that both can
30147
- // be merged, or if the extra shuffle is fast (can use VPSHUFB).
30145
+ // SHL/SRL/SRA on vXi8 can be widened to vYi16 or vYi32 if the constant
30146
+ // amounts can be shuffled such that every pair or quad of adjacent elements
30147
+ // has the same value. This introduces an extra shuffle before and after the
30148
+ // shift, and it is profitable if the operand is already a shuffle so that both
30149
+ // can be merged and the extra shuffle is fast. This is not profitable on
30150
+ // AVX512 because it has 16-bit vector variable shift instruction VPS**VW.
30148
30151
// (shift (shuffle X P1) S1) ->
30149
30152
// (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
30150
30153
// widened, and P2^-1 is the inverse shuffle of P2.
30151
- if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse() && Subtarget.hasSSE3()) {
30152
- bool Profitable = true;
30153
- // VPAND ymm only available on AVX2.
30154
- if (VT == MVT::v32i8 || VT == MVT::v64i8) {
30155
- Profitable = Subtarget.hasAVX2();
30156
- }
30154
+ if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse()
30155
+ && Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
30156
+ constexpr size_t LaneBytes = 16;
30157
+ const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
30157
30158
30158
30159
SmallVector<int, 64> Permutation;
30159
- SmallVector<uint16_t , 64> ShiftAmt;
30160
+ SmallVector<uint8_t , 64> ShiftAmt;
30160
30161
for (size_t I = 0; I < Amt.getNumOperands(); ++I) {
30161
30162
if (Amt.getOperand(I).isUndef())
30162
30163
ShiftAmt.push_back(~0);
30163
30164
else
30164
30165
ShiftAmt.push_back(Amt.getConstantOperandVal(I));
30165
30166
}
30166
30167
30167
- if (Profitable && (VT == MVT::v32i8 || VT == MVT::v64i8)) {
30168
- Profitable = false;
30169
- constexpr size_t LaneBytes = 16;
30170
- const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
30171
-
30172
- // For v32i8 or v64i8, we should check if we can generate a shuffle that
30173
- // may be lowered to VPSHUFB, because it is faster than VPERMB. This is
30174
- // possible if we can apply the same shuffle mask to each v16i8 lane.
30175
- // For example (assuming a lane has 4 elements for simplicity),
30176
- // <1, 2, 2, 1, 4, 3, 3, 4> is handled as <14, 23, 23, 14>, which can
30177
- // be shuffled to adjacent pairs <14, 14, 23, 23> with the VPSHUFB mask
30178
- // <0, 3, 2, 1> (or high level mask <0, 3, 2, 1, 4, 7, 6, 5>).
30179
- // Limitation: if there are some undef in shift amounts, this algorithm
30180
- // may not find a solution even if one exists, as here we only treat a
30181
- // VPSHUFB index as undef if all shuffle amounts of the same index modulo
30182
- // lane size are all undef.
30183
- // Since a byte can only be shifted by 7 bits without being UB, 4 bits are
30184
- // enough to represent the shift amount or undef (0xF).
30185
- std::array<uint16_t, LaneBytes> VPSHUFBShiftAmt = {};
30186
- for (size_t I = 0; I < LaneBytes; ++I)
30187
- for (size_t J = 0; J < NumLanes; ++J)
30188
- VPSHUFBShiftAmt[I] |= (ShiftAmt[I + J * LaneBytes] & 0xF) << (J * 4);
30189
- if (VT == MVT::v32i8) {
30190
- for (size_t I = 0; I < LaneBytes; ++I)
30191
- VPSHUFBShiftAmt[I] |= 0xFF00;
30192
- }
30193
- if (PermuteAndPairVector(VPSHUFBShiftAmt, Permutation)) {
30194
- // Found a VPSHUFB solution, offset the shuffle amount to other lanes.
30195
- Permutation.resize(VT.getVectorNumElements());
30196
- for (size_t I = 0; I < LaneBytes; ++I)
30197
- for (size_t J = 1; J < NumLanes; ++J)
30198
- Permutation[I + J * LaneBytes] = Permutation[I] + J * LaneBytes;
30199
- Profitable = true;
30200
- } else if (R.getOpcode() == ISD::VECTOR_SHUFFLE) {
30201
- // A slower shuffle is profitable if the operand is also a slow shuffle,
30202
- // such that they can be merged.
30203
- // TODO: Use TargetTransformInfo to systematically determine whether
30204
- // inner shuffle is slow. Currently we only check if it contains
30205
- // cross-lane shuffle.
30206
- if (ShuffleVectorSDNode *InnerShuffle = dyn_cast<ShuffleVectorSDNode>(R.getNode())) {
30207
- if (InnerShuffle->getMask().size() == VT.getVectorNumElements() &&
30208
- is128BitLaneCrossingShuffleMask(VT, InnerShuffle->getMask()))
30209
- Profitable = true;
30168
+ // Check if we can find an in-lane shuffle to rearrange the shift amounts,
30169
+ // if so, this transformation may be profitable.
30170
+ bool Profitable;
30171
+ for (size_t I = 0; I < NumLanes; ++I) {
30172
+ if (!(Profitable = PermuteAndPairVector(ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation)))
30173
+ break;
30174
+ }
30175
+
30176
+ // For AVX2, check if we can further rearrange shift amounts into adjacent
30177
+ // quads, so that it can use VPS*LVD instead of VPMUL*W as it is 2 cycles
30178
+ // faster.
30179
+ bool IsAdjacentQuads = false;
30180
+ if (Profitable && Subtarget.hasAVX2()) {
30181
+ SmallVector<uint8_t, 64> EveryOtherShiftAmt;
30182
+ for (size_t I = 0; I < Permutation.size(); I += 2) {
30183
+ uint8_t Shift1 = ShiftAmt[Permutation[I]];
30184
+ uint8_t Shift2 = ShiftAmt[Permutation[I + 1]];
30185
+ assert(Shift1 == Shift2 || ~Shift1 == 0 || ~Shift2 == 0);
30186
+ EveryOtherShiftAmt.push_back(~Shift1 ? Shift1 : Shift2);
30187
+ }
30188
+ SmallVector<int, 32> Permutation2;
30189
+ for (size_t I = 0; I < NumLanes; ++I) {
30190
+ if (!(IsAdjacentQuads = PermuteAndPairVector(ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2], LaneBytes / 2), Permutation2)))
30191
+ break;
30192
+ }
30193
+ if (IsAdjacentQuads) {
30194
+ SmallVector<int, 64> CombinedPermutation;
30195
+ for (int Index : Permutation2) {
30196
+ CombinedPermutation.push_back(Permutation[Index * 2]);
30197
+ CombinedPermutation.push_back(Permutation[Index * 2 + 1]);
30210
30198
}
30199
+ std::swap(Permutation, CombinedPermutation);
30211
30200
}
30212
30201
}
30213
30202
30214
- // If it is still profitable at this point, and has not found a permutation
30215
- // yet, try again with any shuffle index.
30216
- if (Profitable && Permutation.empty()) {
30217
- PermuteAndPairVector<decltype(ShiftAmt), decltype(Permutation),
30218
- SmallMapVector<uint16_t, std::pair<uint16_t, int>, 8>>(ShiftAmt, Permutation);
30203
+ // For right shifts, (V)PMULHUW needs an extra instruction to handle an
30204
+ // amount of 0, disabling the transformation here to be cautious.
30205
+ if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
30206
+ any_of(ShiftAmt, [](auto x) { return x == 0; }))
30207
+ Profitable = false;
30208
+
30209
+ bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
30210
+ // If operand R is not a shuffle by itself, the transformation here adds two
30211
+ // shuffles, adding a non-trivial cost. Here we take out a few cases where
30212
+ // the benefit is questionable according to llvm-mca's modeling.
30213
+ //
30214
+ // Each cell shows latency before/after transform. Here R is not a shuffle.
30215
+ // SSE3
30216
+ // | v16i8 | v32i8 | v64i8
30217
+ // ----------------------------
30218
+ // SLL | 17/17 | 20/20 | 26/26
30219
+ // SRL | 18/17 | 22/20 | 35/26
30220
+ // SRA | 21/19 | 26/22 | 39/30
30221
+ // AVX2 using VPMUL*W
30222
+ // | v16i8 | v32i8 | v64i8
30223
+ // ----------------------------
30224
+ // SLL | 20/18 | 18/18 | 21/21
30225
+ // SRL | 20/18 | 22/18 | 26/21
30226
+ // SRA | 20/20 | 22/20 | 25/23
30227
+ // AVX2 using VPS*LVD
30228
+ // | v16i8 | v32i8 | v64i8
30229
+ // ----------------------------
30230
+ // SLL | 20/16 | 18/16 | 21/20
30231
+ // SRL | 20/16 | 22/16 | 26/20
30232
+ // SRA | 20/18 | 22/18 | 25/22
30233
+ if (!IsOperandShuffle) {
30234
+ if (Subtarget.hasAVX2()) {
30235
+ if (!IsAdjacentQuads || (VT == MVT::v64i8 && Opc == ISD::SHL))
30236
+ Profitable = false;
30237
+ } else {
30238
+ if (Opc == ISD::SHL || ((VT == MVT::v16i8 || VT == MVT::v32i8) && Opc == ISD::SRL))
30239
+ Profitable = false;
30240
+ }
30219
30241
}
30220
30242
30221
30243
// Found a permutation P that can rearrange the shift amounts into adjacent
30222
- // pair of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
30223
- if (!Permutation.empty() ) {
30244
+ // pair or quad of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
30245
+ if (Profitable ) {
30224
30246
SDValue InnerShuffle = DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
30225
30247
SmallVector<SDValue, 64> NewShiftAmt;
30226
30248
for (int Index : Permutation) {
0 commit comments