28
28
#include "llvm/ADT/StringSwitch.h"
29
29
#include "llvm/Analysis/BlockFrequencyInfo.h"
30
30
#include "llvm/Analysis/ObjCARCUtil.h"
31
- #include "llvm/Analysis/ProfileSummaryInfo.h"
32
31
#include "llvm/Analysis/VectorUtils.h"
33
32
#include "llvm/CodeGen/IntrinsicLowering.h"
34
33
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -29782,8 +29781,8 @@ template <typename InputTy, typename PermutationTy,
29782
29781
8>>
29783
29782
static bool PermuteAndPairVector(
29784
29783
const InputTy &Inputs, PermutationTy &Permutation,
29785
- MapTy UnpairedInputs = MapTy()) {
29786
- const auto Wildcard = ~ typename InputTy::value_type() ;
29784
+ MapTy UnpairedInputs = MapTy()) {static_assert(std::is_same<typename InputTy::value_type, uint8_t>::value);
29785
+ const typename InputTy::value_type Wildcard = ~0 ;
29787
29786
SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
29788
29787
29789
29788
size_t OutputOffset = Permutation.size();
@@ -30155,14 +30154,16 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30155
30154
// amounts can be shuffled such that every pair or quad of adjacent elements
30156
30155
// has the same value. This introduces an extra shuffle before and after the
30157
30156
// shift, and it is profitable if the operand is already a shuffle so that both
30158
- // can be merged and the extra shuffle is fast. This is not profitable on
30159
- // AVX512 becasue it has 16-bit vector variable shift instruction VPS**VW.
30157
+ // can be merged or the extra shuffle is fast.
30160
30158
// (shift (shuffle X P1) S1) ->
30161
30159
// (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
30162
30160
// widened, and P2^-1 is the inverse shuffle of P2.
30161
+ // This is not profitable on XOP or AVX512 because they have 8/16-bit vector
30162
+ // variable shift instructions.
30163
30163
if (ConstantAmt &&
30164
30164
(VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) &&
30165
- R.hasOneUse() && Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
30165
+ R.hasOneUse() && Subtarget.hasSSSE3() && !Subtarget.hasAVX512() &&
30166
+ !Subtarget.hasXOP()) {
30166
30167
constexpr size_t LaneBytes = 16;
30167
30168
const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
30168
30169
@@ -30176,7 +30177,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30176
30177
}
30177
30178
30178
30179
// Check if we can find an in-lane shuffle to rearrange the shift amounts,
30179
- // if so, this transformation may be profitable.
30180
+ // if so, this transformation may be profitable. Cross-lane shuffle is
30181
+ // almost never profitable because there is no general 1-instruction
30182
+ // solution.
30180
30183
bool Profitable;
30181
30184
for (size_t I = 0; I < NumLanes; ++I) {
30182
30185
if (!(Profitable = PermuteAndPairVector(
@@ -30193,8 +30196,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30193
30196
for (size_t I = 0; I < Permutation.size(); I += 2) {
30194
30197
uint8_t Shift1 = ShiftAmt[Permutation[I]];
30195
30198
uint8_t Shift2 = ShiftAmt[Permutation[I + 1]];
30196
- assert(Shift1 == Shift2 || ~Shift1 == 0 || ~Shift2 == 0);
30197
- EveryOtherShiftAmt.push_back(~Shift1 ? Shift1 : Shift2);
30199
+ assert(Shift1 == Shift2 || Shift1 == (uint8_t) ~0 ||
30200
+ Shift2 == (uint8_t) ~0);
30201
+ EveryOtherShiftAmt.push_back(Shift1 != (uint8_t) ~0 ? Shift1 : Shift2);
30198
30202
}
30199
30203
SmallVector<int, 32> Permutation2;
30200
30204
for (size_t I = 0; I < NumLanes; ++I) {
@@ -30214,51 +30218,36 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30214
30218
}
30215
30219
}
30216
30220
30217
- // For right shifts, (V)PMULHUW needs an extra instruction to handle an
30218
- // amount of 0, disabling the transformation here to be cautious .
30221
+ // For right shifts, (V)PMULHUW needs 2 extra instructions to handle an
30222
+ // amount of 0, making it unprofitable.
30219
30223
if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
30220
30224
any_of(ShiftAmt, [](auto x) { return x == 0; }))
30221
30225
Profitable = false;
30222
30226
30223
30227
bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
30224
- // If operand R is not a shuffle by itself, the transformation here adds two
30225
- // shuffles, adding a non-trivial cost. Here we take out a few cases where
30226
- // the benefit is questionable according to llvm-mca's modeling.
30227
- //
30228
- // Each cell shows latency before/after transform. Here R is not a shuffle.
30229
- // SSE3
30230
- // | v16i8 | v32i8 | v64i8
30231
- // ----------------------------
30232
- // SLL | 17/17 | 20/20 | 26/26
30233
- // SRL | 18/17 | 22/20 | 35/26
30234
- // SRA | 21/19 | 26/22 | 39/30
30235
- // AVX2 using VPMUL*W
30236
- // | v16i8 | v32i8 | v64i8
30237
- // ----------------------------
30238
- // SLL | 20/18 | 18/18 | 21/21
30239
- // SRL | 20/18 | 22/18 | 26/21
30240
- // SRA | 20/20 | 22/20 | 25/23
30241
- // AVX2 using VPS*LVD
30242
- // | v16i8 | v32i8 | v64i8
30243
- // ----------------------------
30244
- // SLL | 20/16 | 18/16 | 21/20
30245
- // SRL | 20/16 | 22/16 | 26/20
30246
- // SRA | 20/18 | 22/18 | 25/22
30228
+ // If operand R is a shuffle, one of the two shuffles introduced by this
30229
+ // transformation can be merged with it, and the extra shuffle is 1 cycle.
30230
+ // This is generally profitable because it eliminates one (or both) vector
30231
+ // multiplication, which has to be scheduled at least 1 cycle apart.
30232
+ // If operand R is not a shuffle, several cases are not profitable based on
30233
+ // pipeline modeling, so we are excluding them here.
30247
30234
if (!IsOperandShuffle) {
30248
- if (Subtarget.hasAVX2()) {
30249
- if (!IsAdjacentQuads || (VT == MVT::v64i8 && Opc == ISD::SHL))
30235
+ // A hack to detect AMD CPU.
30236
+ if (Subtarget.hasSSE4A() && Opc == ISD::SRA) {
30237
+ if (Opc == ISD::SRA)
30250
30238
Profitable = false;
30251
30239
} else {
30252
- if (Opc == ISD::SHL ||
30253
- ((VT == MVT::v16i8 || VT == MVT::v32i8 ) && Opc == ISD::SRL ))
30240
+ if ((Subtarget.hasAVX() && !Subtarget.hasAVX2()) ||
30241
+ (Subtarget.hasAVX2( ) && !IsAdjacentQuads ))
30254
30242
Profitable = false;
30255
30243
}
30256
30244
}
30257
30245
30258
30246
// Found a permutation P that can rearrange the shift amounts into adjacent
30259
30247
// pair or quad of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
30260
30248
if (Profitable) {
30261
- SDValue InnerShuffle = DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
30249
+ SDValue InnerShuffle =
30250
+ DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
30262
30251
SmallVector<SDValue, 64> NewShiftAmt;
30263
30252
for (int Index : Permutation) {
30264
30253
NewShiftAmt.push_back(Amt.getOperand(Index));
@@ -30267,7 +30256,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30267
30256
for (size_t I = 0; I < NewShiftAmt.size(); I += 2) {
30268
30257
SDValue Even = NewShiftAmt[I];
30269
30258
SDValue Odd = NewShiftAmt[I + 1];
30270
- assert(Even.isUndef() || Odd.isUndef() || Even->getAsZExtVal() == Odd->getAsZExtVal());
30259
+ assert(Even.isUndef() || Odd.isUndef() ||
30260
+ Even->getAsZExtVal() == Odd->getAsZExtVal());
30271
30261
}
30272
30262
#endif
30273
30263
SDValue NewShiftVector = DAG.getBuildVector(VT, dl, NewShiftAmt);
@@ -30276,7 +30266,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
30276
30266
for (size_t I = 0; I < Permutation.size(); ++I) {
30277
30267
InversePermutation[Permutation[I]] = I;
30278
30268
}
30279
- SDValue OuterShuffle = DAG.getVectorShuffle(VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation);
30269
+ SDValue OuterShuffle = DAG.getVectorShuffle(
30270
+ VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation);
30280
30271
return OuterShuffle;
30281
30272
}
30282
30273
}
0 commit comments