Skip to content

Commit 0a0f480

Browse files
committed
Second version: more cpu latency measurement with llvm-mca
1 parent a398aae commit 0a0f480

File tree

1 file changed

+98
-76
lines changed

1 file changed

+98
-76
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 98 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -29769,24 +29769,27 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
2976929769
// Given a vector of values, find a permutation such that every adjacent even-
2977029770
// odd pair has the same value. ~0 is reserved as a special value for wildcard,
2977129771
// which can be paired with any value. Returns true if a permutation is found.
29772+
// If output Permutation is not empty, permutation index starts at its previous
29773+
// size, so that this function can concatenate the result of multiple calls.
29774+
// UnpairedInputs contains values yet to be paired, mapping an unpaired value to
29775+
// its current neighbor's value and index.
29776+
// Do not use llvm::DenseMap, as ~0 is a reserved key.
2977229777
template <typename InputTy,
2977329778
typename PermutationTy,
29774-
typename MapTy = std::unordered_map<typename InputTy::value_type,
29775-
std::pair<typename InputTy::value_type, typename PermutationTy::value_type>>>
29779+
typename MapTy = SmallMapVector<typename InputTy::value_type,
29780+
std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>>
2977629781
static bool PermuteAndPairVector(const InputTy& Inputs,
29777-
PermutationTy &Permutation) {
29782+
PermutationTy &Permutation,
29783+
MapTy UnpairedInputs = SmallMapVector<typename InputTy::value_type,
29784+
std::pair<typename InputTy::value_type, typename PermutationTy::value_type>, 8>()) {
2977829785
const auto Wildcard = ~typename InputTy::value_type();
29779-
29780-
// List of values to be paired, mapping an unpaired value to its current
29781-
// neighbor's value and index.
29782-
MapTy UnpairedInputs;
2978329786
SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
2978429787

29785-
Permutation.clear();
29788+
size_t OutputOffset = Permutation.size();
2978629789
typename PermutationTy::value_type I = 0;
2978729790
for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end(); InputIt != InputEnd;) {
29788-
Permutation.push_back(I);
29789-
Permutation.push_back(I + 1);
29791+
Permutation.push_back(OutputOffset + I);
29792+
Permutation.push_back(OutputOffset + I + 1);
2979029793

2979129794
auto Even = *InputIt++;
2979229795
assert(InputIt != InputEnd && "Expected even number of elements");
@@ -29806,7 +29809,7 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
2980629809
// value's neighbor, otherwise the current value is added to the map.
2980729810
if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(This, std::make_pair(Other, OtherIndex)); !Inserted) {
2980829811
auto [SwapValue, SwapIndex] = MapIt->second;
29809-
std::swap(Permutation[SwapIndex], Permutation[ThisIndex]);
29812+
std::swap(Permutation[OutputOffset + SwapIndex], Permutation[OutputOffset + ThisIndex]);
2981029813
This = SwapValue;
2981129814
UnpairedInputs.erase(MapIt);
2981229815

@@ -29850,13 +29853,12 @@ static bool PermuteAndPairVector(const InputTy& Inputs,
2985029853
if (Neighbor != Wildcard) {
2985129854
assert(UnpairedInputs.count(Neighbor));
2985229855
if (WildcardPairs.size()) {
29853-
std::swap(Permutation[WildcardPairs.back()], Permutation[NeighborIndex]);
29856+
std::swap(Permutation[OutputOffset + WildcardPairs.back()], Permutation[OutputOffset + NeighborIndex]);
2985429857
WildcardPairs.pop_back();
2985529858
// Mark the neighbor as processed.
2985629859
UnpairedInputs[Neighbor].first = Wildcard;
29857-
} else {
29860+
} else
2985829861
return false;
29859-
}
2986029862
}
2986129863
}
2986229864
return true;
@@ -30140,87 +30142,107 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
3014030142
}
3014130143
}
3014230144

30143-
// ISD::SRA/SRL/SHL on vXi8 can be widened to vYi16 (Y = X/2) if the constant
30144-
// amounts can be shuffled such that every pair of adjacent elements has the
30145-
// same value. This introduces an extra shuffle before and after the shift,
30146-
// and it is profitable if the operand is aready a shuffle so that both can
30147-
// be merged, or if the extra shuffle is fast (can use VPSHUFB).
30145+
// SHL/SRL/SRA on vXi8 can be widened to vYi16 or vYi32 if the constant
30146+
// amounts can be shuffled such that every pair or quad of adjacent elements
30147+
// has the same value. This introduces an extra shuffle before and after the
30148+
// shift, and it is profitable if the operand is already a shuffle so that both
30149+
// can be merged and the extra shuffle is fast. This is not profitable on
30150+
// AVX512 because it has the 16-bit vector variable shift instructions VPS**VW.
3014830151
// (shift (shuffle X P1) S1) ->
3014930152
// (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
3015030153
// widened, and P2^-1 is the inverse shuffle of P2.
30151-
if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse() && Subtarget.hasSSE3()) {
30152-
bool Profitable = true;
30153-
// VPAND ymm only available on AVX2.
30154-
if (VT == MVT::v32i8 || VT == MVT::v64i8) {
30155-
Profitable = Subtarget.hasAVX2();
30156-
}
30154+
if (ConstantAmt && (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) && R.hasOneUse()
30155+
&& Subtarget.hasSSE3() && !Subtarget.hasAVX512()) {
30156+
constexpr size_t LaneBytes = 16;
30157+
const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
3015730158

3015830159
SmallVector<int, 64> Permutation;
30159-
SmallVector<uint16_t, 64> ShiftAmt;
30160+
SmallVector<uint8_t, 64> ShiftAmt;
3016030161
for (size_t I = 0; I < Amt.getNumOperands(); ++I) {
3016130162
if (Amt.getOperand(I).isUndef())
3016230163
ShiftAmt.push_back(~0);
3016330164
else
3016430165
ShiftAmt.push_back(Amt.getConstantOperandVal(I));
3016530166
}
3016630167

30167-
if (Profitable && (VT == MVT::v32i8 || VT == MVT::v64i8)) {
30168-
Profitable = false;
30169-
constexpr size_t LaneBytes = 16;
30170-
const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
30171-
30172-
// For v32i8 or v64i8, we should check if we can generate a shuffle that
30173-
// may be lowered to VPSHUFB, because it is faster than VPERMB. This is
30174-
// possible if we can apply the same shuffle mask to each v16i8 lane.
30175-
// For example (assuming a lane has 4 elements for simplicity),
30176-
// <1, 2, 2, 1, 4, 3, 3, 4> is handled as <14, 23, 23, 14>, which can
30177-
// be shuffled to adjacent pairs <14, 14, 23, 23> with the VPSHUFB mask
30178-
// <0, 3, 2, 1> (or high level mask <0, 3, 2, 1, 4, 7, 6, 5>).
30179-
// Limitation: if there are some undef in shift amounts, this algorithm
30180-
// may not find a solution even if one exists, as here we only treat a
30181-
// VPSHUFB index as undef if all shuffle amounts of the same index modulo
30182-
// lane size are all undef.
30183-
// Since a byte can only be shifted by 7 bits without being UB, 4 bits are
30184-
// enough to represent the shift amount or undef (0xF).
30185-
std::array<uint16_t, LaneBytes> VPSHUFBShiftAmt = {};
30186-
for (size_t I = 0; I < LaneBytes; ++I)
30187-
for (size_t J = 0; J < NumLanes; ++J)
30188-
VPSHUFBShiftAmt[I] |= (ShiftAmt[I + J * LaneBytes] & 0xF) << (J * 4);
30189-
if (VT == MVT::v32i8) {
30190-
for (size_t I = 0; I < LaneBytes; ++I)
30191-
VPSHUFBShiftAmt[I] |= 0xFF00;
30192-
}
30193-
if (PermuteAndPairVector(VPSHUFBShiftAmt, Permutation)) {
30194-
// Found a VPSHUFB solution, offset the shuffle amount to other lanes.
30195-
Permutation.resize(VT.getVectorNumElements());
30196-
for (size_t I = 0; I < LaneBytes; ++I)
30197-
for (size_t J = 1; J < NumLanes; ++J)
30198-
Permutation[I + J * LaneBytes] = Permutation[I] + J * LaneBytes;
30199-
Profitable = true;
30200-
} else if (R.getOpcode() == ISD::VECTOR_SHUFFLE) {
30201-
// A slower shuffle is profitable if the operand is also a slow shuffle,
30202-
// such that they can be merged.
30203-
// TODO: Use TargetTransformInfo to systematically determine whether
30204-
// inner shuffle is slow. Currently we only check if it contains
30205-
// cross-lane shuffle.
30206-
if (ShuffleVectorSDNode *InnerShuffle = dyn_cast<ShuffleVectorSDNode>(R.getNode())) {
30207-
if (InnerShuffle->getMask().size() == VT.getVectorNumElements() &&
30208-
is128BitLaneCrossingShuffleMask(VT, InnerShuffle->getMask()))
30209-
Profitable = true;
30168+
// Check if we can find an in-lane shuffle to rearrange the shift amounts,
30169+
// if so, this transformation may be profitable.
30170+
bool Profitable;
30171+
for (size_t I = 0; I < NumLanes; ++I) {
30172+
if (!(Profitable = PermuteAndPairVector(ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation)))
30173+
break;
30174+
}
30175+
30176+
// For AVX2, check if we can further rearrange shift amounts into adjacent
30177+
// quads, so that it can use VPS*LVD instead of VPMUL*W as it is 2 cycles
30178+
// faster.
30179+
bool IsAdjacentQuads = false;
30180+
if (Profitable && Subtarget.hasAVX2()) {
30181+
SmallVector<uint8_t, 64> EveryOtherShiftAmt;
30182+
for (size_t I = 0; I < Permutation.size(); I += 2) {
30183+
uint8_t Shift1 = ShiftAmt[Permutation[I]];
30184+
uint8_t Shift2 = ShiftAmt[Permutation[I + 1]];
30185+
assert(Shift1 == Shift2 || ~Shift1 == 0 || ~Shift2 == 0);
30186+
EveryOtherShiftAmt.push_back(~Shift1 ? Shift1 : Shift2);
30187+
}
30188+
SmallVector<int, 32> Permutation2;
30189+
for (size_t I = 0; I < NumLanes; ++I) {
30190+
if (!(IsAdjacentQuads = PermuteAndPairVector(ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2], LaneBytes / 2), Permutation2)))
30191+
break;
30192+
}
30193+
if (IsAdjacentQuads) {
30194+
SmallVector<int, 64> CombinedPermutation;
30195+
for (int Index : Permutation2) {
30196+
CombinedPermutation.push_back(Permutation[Index * 2]);
30197+
CombinedPermutation.push_back(Permutation[Index * 2 + 1]);
3021030198
}
30199+
std::swap(Permutation, CombinedPermutation);
3021130200
}
3021230201
}
3021330202

30214-
// If it is still profitable at this point, and has not found a permutation
30215-
// yet, try again with any shuffle index.
30216-
if (Profitable && Permutation.empty()) {
30217-
PermuteAndPairVector<decltype(ShiftAmt), decltype(Permutation),
30218-
SmallMapVector<uint16_t, std::pair<uint16_t, int>, 8>>(ShiftAmt, Permutation);
30203+
// For right shifts, (V)PMULHUW needs an extra instruction to handle an
30204+
// amount of 0, disabling the transformation here to be cautious.
30205+
if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
30206+
any_of(ShiftAmt, [](auto x) { return x == 0; }))
30207+
Profitable = false;
30208+
30209+
bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
30210+
// If operand R is not a shuffle by itself, the transformation here adds two
30211+
// shuffles, adding a non-trivial cost. Here we take out a few cases where
30212+
// the benefit is questionable according to llvm-mca's modeling.
30213+
//
30214+
// Each cell shows latency before/after transform. Here R is not a shuffle.
30215+
// SSE3
30216+
// | v16i8 | v32i8 | v64i8
30217+
// ----------------------------
30218+
// SLL | 17/17 | 20/20 | 26/26
30219+
// SRL | 18/17 | 22/20 | 35/26
30220+
// SRA | 21/19 | 26/22 | 39/30
30221+
// AVX2 using VPMUL*W
30222+
// | v16i8 | v32i8 | v64i8
30223+
// ----------------------------
30224+
// SLL | 20/18 | 18/18 | 21/21
30225+
// SRL | 20/18 | 22/18 | 26/21
30226+
// SRA | 20/20 | 22/20 | 25/23
30227+
// AVX2 using VPS*LVD
30228+
// | v16i8 | v32i8 | v64i8
30229+
// ----------------------------
30230+
// SLL | 20/16 | 18/16 | 21/20
30231+
// SRL | 20/16 | 22/16 | 26/20
30232+
// SRA | 20/18 | 22/18 | 25/22
30233+
if (!IsOperandShuffle) {
30234+
if (Subtarget.hasAVX2()) {
30235+
if (!IsAdjacentQuads || (VT == MVT::v64i8 && Opc == ISD::SHL))
30236+
Profitable = false;
30237+
} else {
30238+
if (Opc == ISD::SHL || ((VT == MVT::v16i8 || VT == MVT::v32i8) && Opc == ISD::SRL))
30239+
Profitable = false;
30240+
}
3021930241
}
3022030242

3022130243
// Found a permutation P that can rearrange the shift amounts into adjacent
30222-
// pair of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
30223-
if (!Permutation.empty()) {
30244+
// pair or quad of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
30245+
if (Profitable) {
3022430246
SDValue InnerShuffle = DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
3022530247
SmallVector<SDValue, 64> NewShiftAmt;
3022630248
for (int Index : Permutation) {

0 commit comments

Comments
 (0)