Skip to content

Commit 0bdbc64

Browse files
committed
bug fixes
1 parent 02249f3 commit 0bdbc64

File tree

1 file changed

+32
-41
lines changed

1 file changed

+32
-41
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 32 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
#include "llvm/ADT/StringSwitch.h"
2929
#include "llvm/Analysis/BlockFrequencyInfo.h"
3030
#include "llvm/Analysis/ObjCARCUtil.h"
31-
#include "llvm/Analysis/ProfileSummaryInfo.h"
3231
#include "llvm/Analysis/VectorUtils.h"
3332
#include "llvm/CodeGen/IntrinsicLowering.h"
3433
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -29782,8 +29781,8 @@ template <typename InputTy, typename PermutationTy,
2978229781
8>>
2978329782
static bool PermuteAndPairVector(
2978429783
const InputTy &Inputs, PermutationTy &Permutation,
29785-
MapTy UnpairedInputs = MapTy()) {
29786-
const auto Wildcard = ~typename InputTy::value_type();
29784+
MapTy UnpairedInputs = MapTy()) {static_assert(std::is_same<typename InputTy::value_type, uint8_t>::value);
29785+
const typename InputTy::value_type Wildcard = ~0;
2978729786
SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
2978829787

2978929788
size_t OutputOffset = Permutation.size();
@@ -30155,14 +30154,16 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3015530154
// amounts can be shuffled such that every pair or quad of adjacent elements
3015630155
// has the same value. This introduces an extra shuffle before and after the
3015730156
// shift, and it is profitable if the operand is already a shuffle so that both
30158-
// can be merged and the extra shuffle is fast. This is not profitable on
30159-
// AVX512 becasue it has 16-bit vector variable shift instruction VPS**VW.
30157+
// can be merged or the extra shuffle is fast.
3016030158
// (shift (shuffle X P1) S1) ->
3016130159
// (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
3016230160
// widened, and P2^-1 is the inverse shuffle of P2.
30161+
// This is not profitable on XOP or AVX512 because they have 8/16-bit vector
30162+
// variable shift instructions.
3016330163
if (ConstantAmt &&
3016430164
(VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) &&
30165-
R.hasOneUse() && Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
30165+
R.hasOneUse() && Subtarget.hasSSSE3() && !Subtarget.hasAVX512() &&
30166+
!Subtarget.hasXOP()) {
3016630167
constexpr size_t LaneBytes = 16;
3016730168
const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
3016830169

@@ -30176,7 +30177,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3017630177
}
3017730178

3017830179
// Check if we can find an in-lane shuffle to rearrange the shift amounts,
30179-
// if so, this transformation may be profitable.
30180+
// if so, this transformation may be profitable. Cross-lane shuffle is
30181+
// almost never profitable because there is no general 1-instruction
30182+
// solution.
3018030183
bool Profitable;
3018130184
for (size_t I = 0; I < NumLanes; ++I) {
3018230185
if (!(Profitable = PermuteAndPairVector(
@@ -30193,8 +30196,9 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3019330196
for (size_t I = 0; I < Permutation.size(); I += 2) {
3019430197
uint8_t Shift1 = ShiftAmt[Permutation[I]];
3019530198
uint8_t Shift2 = ShiftAmt[Permutation[I + 1]];
30196-
assert(Shift1 == Shift2 || ~Shift1 == 0 || ~Shift2 == 0);
30197-
EveryOtherShiftAmt.push_back(~Shift1 ? Shift1 : Shift2);
30199+
assert(Shift1 == Shift2 || Shift1 == (uint8_t) ~0 ||
30200+
Shift2 == (uint8_t) ~0);
30201+
EveryOtherShiftAmt.push_back(Shift1 != (uint8_t) ~0 ? Shift1 : Shift2);
3019830202
}
3019930203
SmallVector<int, 32> Permutation2;
3020030204
for (size_t I = 0; I < NumLanes; ++I) {
@@ -30214,51 +30218,36 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3021430218
}
3021530219
}
3021630220

30217-
// For right shifts, (V)PMULHUW needs an extra instruction to handle an
30218-
// amount of 0, disabling the transformation here to be cautious.
30221+
// For right shifts, (V)PMULHUW needs 2 extra instructions to handle an
30222+
// amount of 0, making it unprofitable.
3021930223
if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
3022030224
any_of(ShiftAmt, [](auto x) { return x == 0; }))
3022130225
Profitable = false;
3022230226

3022330227
bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
30224-
// If operand R is not a shuffle by itself, the transformation here adds two
30225-
// shuffles, adding a non-trivial cost. Here we take out a few cases where
30226-
// the benefit is questionable according to llvm-mca's modeling.
30227-
//
30228-
// Each cell shows latency before/after transform. Here R is not a shuffle.
30229-
// SSE3
30230-
// | v16i8 | v32i8 | v64i8
30231-
// ----------------------------
30232-
// SLL | 17/17 | 20/20 | 26/26
30233-
// SRL | 18/17 | 22/20 | 35/26
30234-
// SRA | 21/19 | 26/22 | 39/30
30235-
// AVX2 using VPMUL*W
30236-
// | v16i8 | v32i8 | v64i8
30237-
// ----------------------------
30238-
// SLL | 20/18 | 18/18 | 21/21
30239-
// SRL | 20/18 | 22/18 | 26/21
30240-
// SRA | 20/20 | 22/20 | 25/23
30241-
// AVX2 using VPS*LVD
30242-
// | v16i8 | v32i8 | v64i8
30243-
// ----------------------------
30244-
// SLL | 20/16 | 18/16 | 21/20
30245-
// SRL | 20/16 | 22/16 | 26/20
30246-
// SRA | 20/18 | 22/18 | 25/22
30228+
// If operand R is a shuffle, one of the two shuffles introduced by this
30229+
// transformation can be merged with it, and the extra shuffle is 1 cycle.
30230+
// This is generally profitable because it eliminates one (or both) vector
30231+
// multiplication, which has to be scheduled at least 1 cycle apart.
30232+
// If operand R is not a shuffle, several cases are not profitable based on
30233+
// pipeline modeling, so we are excluding them here.
3024730234
if (!IsOperandShuffle) {
30248-
if (Subtarget.hasAVX2()) {
30249-
if (!IsAdjacentQuads || (VT == MVT::v64i8 && Opc == ISD::SHL))
30235+
// A hack to detect AMD CPU.
30236+
if (Subtarget.hasSSE4A() && Opc == ISD::SRA) {
30237+
if (Opc == ISD::SRA)
3025030238
Profitable = false;
3025130239
} else {
30252-
if (Opc == ISD::SHL ||
30253-
((VT == MVT::v16i8 || VT == MVT::v32i8) && Opc == ISD::SRL))
30240+
if ((Subtarget.hasAVX() && !Subtarget.hasAVX2()) ||
30241+
(Subtarget.hasAVX2() && !IsAdjacentQuads))
3025430242
Profitable = false;
3025530243
}
3025630244
}
3025730245

3025830246
// Found a permutation P that can rearrange the shift amounts into adjacent
3025930247
// pair or quad of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
3026030248
if (Profitable) {
30261-
SDValue InnerShuffle = DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
30249+
SDValue InnerShuffle =
30250+
DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
3026230251
SmallVector<SDValue, 64> NewShiftAmt;
3026330252
for (int Index : Permutation) {
3026430253
NewShiftAmt.push_back(Amt.getOperand(Index));
@@ -30267,7 +30256,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3026730256
for (size_t I = 0; I < NewShiftAmt.size(); I += 2) {
3026830257
SDValue Even = NewShiftAmt[I];
3026930258
SDValue Odd = NewShiftAmt[I + 1];
30270-
assert(Even.isUndef() || Odd.isUndef() || Even->getAsZExtVal() == Odd->getAsZExtVal());
30259+
assert(Even.isUndef() || Odd.isUndef() ||
30260+
Even->getAsZExtVal() == Odd->getAsZExtVal());
3027130261
}
3027230262
#endif
3027330263
SDValue NewShiftVector = DAG.getBuildVector(VT, dl, NewShiftAmt);
@@ -30276,7 +30266,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3027630266
for (size_t I = 0; I < Permutation.size(); ++I) {
3027730267
InversePermutation[Permutation[I]] = I;
3027830268
}
30279-
SDValue OuterShuffle = DAG.getVectorShuffle(VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation);
30269+
SDValue OuterShuffle = DAG.getVectorShuffle(
30270+
VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation);
3028030271
return OuterShuffle;
3028130272
}
3028230273
}

0 commit comments

Comments
 (0)