Skip to content

[X86][Codegen] Shuffle certain shifts on i8 vectors to create opportunity for vectorized shift instructions #117980

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
247 changes: 246 additions & 1 deletion llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ObjCARCUtil.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
Expand Down Expand Up @@ -29766,6 +29765,113 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
return SDValue();
}

// Given a vector of values, find a permutation such that every adjacent even-
// odd pair has the same value. ~0 is reserved as a special value for wildcard,
// which can be paired with any value. Returns true if a permutation is found.
// If output Permutation is not empty, permutation index starts at its previous
// size, so that this function can concatenate the result of multiple calls.
// UnpairedInputs contains values yet to be paired, mapping an unpaired value to
// its current neighbor's value and index.
// Do not use llvm::DenseMap as ~0 is reserved key.
template <typename InputTy, typename PermutationTy,
typename MapTy =
SmallMapVector<typename InputTy::value_type,
std::pair<typename InputTy::value_type,
typename PermutationTy::value_type>,
8>>
static bool PermuteAndPairVector(
const InputTy &Inputs, PermutationTy &Permutation,
MapTy UnpairedInputs = MapTy()) {
const typename InputTy::value_type Wildcard = ~0;
SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;

size_t OutputOffset = Permutation.size();
typename PermutationTy::value_type I = 0;
for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end();
InputIt != InputEnd;) {
Permutation.push_back(OutputOffset + I);
Permutation.push_back(OutputOffset + I + 1);

auto Even = *InputIt++;
assert(InputIt != InputEnd && "Expected even number of elements");
auto Odd = *InputIt++;

// If both are wildcards, note it for later use by unpairable values.
if (Even == Wildcard && Odd == Wildcard) {
WildcardPairs.push_back(I);
}

// If both are equal, they are in good position.
if (Even != Odd) {
auto DoWork = [&](auto &This, auto ThisIndex, auto Other,
auto OtherIndex) {
if (This != Wildcard) {
// For non-wildcard value, check if it can pair with an exisiting
// unpaired value from UnpairedInputs, if so, swap with the unpaired
// value's neighbor, otherwise the current value is added to the map.
if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(
This, std::make_pair(Other, OtherIndex));
!Inserted) {
auto [SwapValue, SwapIndex] = MapIt->second;
std::swap(Permutation[OutputOffset + SwapIndex],
Permutation[OutputOffset + ThisIndex]);
This = SwapValue;
UnpairedInputs.erase(MapIt);

if (This == Other) {
if (This == Wildcard) {
// We freed up a wildcard pair by pairing two non-adjacent
// values, note it for later use by unpairable values.
WildcardPairs.push_back(I);
} else {
// The swapped element also forms a pair with Other, so it can
// be removed from the map.
assert(UnpairedInputs.count(This));
UnpairedInputs.erase(This);
}
} else {
// Swapped in an unpaired value, update its info.
if (This != Wildcard) {
assert(UnpairedInputs.count(This));
UnpairedInputs[This] = std::make_pair(Other, OtherIndex);
}
// If its neighbor is also in UnpairedInputs, update its info too.
if (auto OtherMapIt = UnpairedInputs.find(Other);
OtherMapIt != UnpairedInputs.end() &&
OtherMapIt->second.second == ThisIndex) {
OtherMapIt->second.first = This;
}
}
}
}
};
DoWork(Even, I, Odd, I + 1);
if (Even != Odd) {
DoWork(Odd, I + 1, Even, I);
}
}
I += 2;
}

// Now check if each remaining unpaired neighboring values can be swapped with
// a wildcard pair to form two paired values.
for (auto &[Unpaired, V] : UnpairedInputs) {
auto [Neighbor, NeighborIndex] = V;
if (Neighbor != Wildcard) {
assert(UnpairedInputs.count(Neighbor));
if (WildcardPairs.size()) {
std::swap(Permutation[OutputOffset + WildcardPairs.back()],
Permutation[OutputOffset + NeighborIndex]);
WildcardPairs.pop_back();
// Mark the neighbor as processed.
UnpairedInputs[Neighbor].first = Wildcard;
} else
return false;
}
}
return true;
}

static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
Expand Down Expand Up @@ -30044,6 +30150,145 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
}
}

// SHL/SRL/SRA on vXi8 can be widened to vYi16 or vYi32 if the constant
// amounts can be shuffled such that every pair or quad of adjacent elements
// has the same value. This introduces an extra shuffle before and after the
// shift, and it is profitable if the operand is already a shuffle so that both
// can be merged or the extra shuffle is fast.
// (shift (shuffle X P1) S1) ->
// (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
// widened, and P2^-1 is the inverse shuffle of P2.
// This is not profitable on XOP or AVX512 because it has 8/16-bit vector
// variable shift instructions.
// Picking out GFNI because normally it implies AVX512, and there is no
// latency data for CPU with GFNI and SSE or AVX only, but there are tests for
// such combination anyways.
if (ConstantAmt &&
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The lowering scheme immediately above this is very similar to what you're doing (and a lot easier to grok) - I'd recommend you look at extending that code instead of introducing this separate implementation.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code above is to handle shift widening when adjacent pairs have same shift amount. My patch tries to find a permutation to create such shift, but does not perform widening itself (and hand it to the code above), so it is in fact a different functionality and better left in a separate section

(VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) &&
R.hasOneUse() && Subtarget.hasSSSE3() && !Subtarget.hasAVX512() &&
!Subtarget.hasXOP() && !Subtarget.hasGFNI()) {
constexpr size_t LaneBytes = 16;
const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;

SmallVector<int, 64> Permutation;
SmallVector<uint8_t, 64> ShiftAmt;
for (size_t I = 0; I < Amt.getNumOperands(); ++I) {
if (Amt.getOperand(I).isUndef())
ShiftAmt.push_back(~0);
else {
auto A = Amt.getConstantOperandVal(I);
ShiftAmt.push_back(A > 8 ? 8 : A);
}
}

// Check if we can find an in-lane shuffle to rearrange the shift amounts,
// if so, this transformation may be profitable. Cross-lane shuffle is
// almost never profitable because there is no general 1-instruction
// solution.
bool Profitable;
for (size_t I = 0; I < NumLanes; ++I) {
if (!(Profitable = PermuteAndPairVector(
ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation)))
break;
}

// For AVX2, check if we can further rearrange shift amounts into adjacent
// quads, so that it can use VPS*LVD instead of VPMUL*W as it is 2 cycles
// faster.
bool IsAdjacentQuads = false;
if (Profitable && Subtarget.hasAVX2()) {
SmallVector<uint8_t, 64> EveryOtherShiftAmt;
for (size_t I = 0; I < Permutation.size(); I += 2) {
uint8_t Shift1 = ShiftAmt[Permutation[I]];
uint8_t Shift2 = ShiftAmt[Permutation[I + 1]];
assert(Shift1 == Shift2 || Shift1 == (uint8_t) ~0 ||
Shift2 == (uint8_t) ~0);
EveryOtherShiftAmt.push_back(Shift1 != (uint8_t) ~0 ? Shift1 : Shift2);
}
SmallVector<int, 32> Permutation2;
for (size_t I = 0; I < NumLanes; ++I) {
if (!(IsAdjacentQuads = PermuteAndPairVector(
ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2],
LaneBytes / 2),
Permutation2)))
break;
}
if (IsAdjacentQuads) {
SmallVector<int, 64> CombinedPermutation;
for (int Index : Permutation2) {
CombinedPermutation.push_back(Permutation[Index * 2]);
CombinedPermutation.push_back(Permutation[Index * 2 + 1]);
}
std::swap(Permutation, CombinedPermutation);
}
}

// For right shifts, (V)PMULHUW needs 2 extra instructions to handle an
// amount of 0, making it unprofitable.
if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
any_of(ShiftAmt, [](uint8_t x) { return x == 0; }))
Profitable = false;

bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
// If operand R is a shuffle, one of the two shuffles introduced by this
// transformation can be merged with it, and the extra shuffle is 1 cycle.
// This is generally profitable because it eliminates one (or both) vector
// multiplication, which has to be scheduled at least 1 cycle apart.
// If operand R is not a shuffle, several cases are not profitable based on
// pipeline modeling, so we are excluding them here.
if (!IsOperandShuffle) {
// A hack to detect AMD Zen series CPU.
if (Subtarget.hasSSE4A()) {
if (!IsAdjacentQuads)
Profitable = false;
// A hack to detect Zen+ and Zen 2, because VPSRLVD is 2 cycles slower
// than in Zen 3, so this transformation should not be used.
else if (!Subtarget.hasVAES())
Profitable = false;
} else {
if ((Subtarget.hasAVX() && !Subtarget.hasAVX2()) ||
(Subtarget.hasAVX2() && !IsAdjacentQuads))
Profitable = false;
}
}

// If the shuffle is identity, do not insert it. It also prevents this
// transformation from being applied recursively.
if (llvm::equal(Permutation, llvm::seq(Permutation.size())))
Profitable = false;

// Found a permutation P that can rearrange the shift amounts into adjacent
// pair or quad of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
if (Profitable) {
SDValue InnerShuffle =
DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
SmallVector<SDValue, 64> NewShiftAmt;
for (int Index : Permutation) {
NewShiftAmt.push_back(Amt.getOperand(Index));
}
// If using (V)PMULHUW, any undef pair is resolved to shift by 8 so that
// it does not create extra instructions in case it is resolved to 0.
for (size_t I = 0; I < NewShiftAmt.size(); I += 2) {
SDValue &Even = NewShiftAmt[I];
SDValue &Odd = NewShiftAmt[I + 1];
assert(Even.isUndef() || Odd.isUndef() ||
Even->getAsZExtVal() == Odd->getAsZExtVal());
if (!IsAdjacentQuads && Even.isUndef() && Odd.isUndef())
Even = DAG.getConstant(8, dl, VT.getScalarType());
}

SDValue NewShiftVector = DAG.getBuildVector(VT, dl, NewShiftAmt);
SDValue NewShift = DAG.getNode(Opc, dl, VT, InnerShuffle, NewShiftVector);
SmallVector<int, 64> InversePermutation(Permutation.size());
for (size_t I = 0; I < Permutation.size(); ++I) {
InversePermutation[Permutation[I]] = I;
}
SDValue OuterShuffle = DAG.getVectorShuffle(
VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation);
return OuterShuffle;
}
}

// If possible, lower this packed shift into a vector multiply instead of
// expanding it into a sequence of scalar shifts.
// For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.
Expand Down
Loading
Loading