llvm · huangjd · Nov 21, 2024 · Nov 28, 2024 · Nov 28, 2024 · Dec 13, 2024
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -28,7 +28,6 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/ObjCARCUtil.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -29766,6 +29765,113 @@ static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl,
   return SDValue();
 }
 
+// Given a vector of values, find a permutation such that every adjacent even-
+// odd pair has the same value. ~0 is reserved as a special value for wildcard,
+// which can be paired with any value. Returns true if a permutation is found.
+// If output Permutation is not empty, permutation index starts at its previous
+// size, so that this function can concatenate the result of multiple calls.
+// UnpairedInputs contains values yet to be paired, mapping an unpaired value to
+// its current neighbor's value and index.
+// Do not use llvm::DenseMap as ~0 is reserved key.
+template <typename InputTy, typename PermutationTy,
+          typename MapTy =
+              SmallMapVector<typename InputTy::value_type,
+                             std::pair<typename InputTy::value_type,
+                                       typename PermutationTy::value_type>,
+                             8>>
+static bool PermuteAndPairVector(
+    const InputTy &Inputs, PermutationTy &Permutation,
+    MapTy UnpairedInputs = MapTy()) {
+  const typename InputTy::value_type Wildcard = ~0;
+  SmallVector<typename PermutationTy::value_type, 16> WildcardPairs;
+
+  size_t OutputOffset = Permutation.size();
+  typename PermutationTy::value_type I = 0;
+  for (auto InputIt = Inputs.begin(), InputEnd = Inputs.end();
+       InputIt != InputEnd;) {
+    Permutation.push_back(OutputOffset + I);
+    Permutation.push_back(OutputOffset + I + 1);
+
+    auto Even = *InputIt++;
+    assert(InputIt != InputEnd && "Expected even number of elements");
+    auto Odd = *InputIt++;
+
+    // If both are wildcards, note it for later use by unpairable values.
+    if (Even == Wildcard && Odd == Wildcard) {
+      WildcardPairs.push_back(I);
+    }
+
+    // If both are equal, they are in good position.
+    if (Even != Odd) {
+      auto DoWork = [&](auto &This, auto ThisIndex, auto Other,
+                        auto OtherIndex) {
+        if (This != Wildcard) {
+          // For non-wildcard value, check if it can pair with an exisiting
+          // unpaired value from UnpairedInputs, if so, swap with the unpaired
+          // value's neighbor, otherwise the current value is added to the map.
+          if (auto [MapIt, Inserted] = UnpairedInputs.try_emplace(
+                  This, std::make_pair(Other, OtherIndex));
+              !Inserted) {
+            auto [SwapValue, SwapIndex] = MapIt->second;
+            std::swap(Permutation[OutputOffset + SwapIndex],
+                      Permutation[OutputOffset + ThisIndex]);
+            This = SwapValue;
+            UnpairedInputs.erase(MapIt);
+
+            if (This == Other) {
+              if (This == Wildcard) {
+                // We freed up a wildcard pair by pairing two non-adjacent
+                // values, note it for later use by unpairable values.
+                WildcardPairs.push_back(I);
+              } else {
+                // The swapped element also forms a pair with Other, so it can
+                // be removed from the map.
+                assert(UnpairedInputs.count(This));
+                UnpairedInputs.erase(This);
+              }
+            } else {
+              // Swapped in an unpaired value, update its info.
+              if (This != Wildcard) {
+                assert(UnpairedInputs.count(This));
+                UnpairedInputs[This] = std::make_pair(Other, OtherIndex);
+              }
+              // If its neighbor is also in UnpairedInputs, update its info too.
+              if (auto OtherMapIt = UnpairedInputs.find(Other);
+                  OtherMapIt != UnpairedInputs.end() &&
+                  OtherMapIt->second.second == ThisIndex) {
+                OtherMapIt->second.first = This;
+              }
+            }
+          }
+        }
+      };
+      DoWork(Even, I, Odd, I + 1);
+      if (Even != Odd) {
+        DoWork(Odd, I + 1, Even, I);
+      }
+    }
+    I += 2;
+  }
+
+  // Now check if each remaining unpaired neighboring values can be swapped with
+  // a wildcard pair to form two paired values.
+  for (auto &[Unpaired, V] : UnpairedInputs) {
+    auto [Neighbor, NeighborIndex] = V;
+    if (Neighbor != Wildcard) {
+      assert(UnpairedInputs.count(Neighbor));
+      if (WildcardPairs.size()) {
+        std::swap(Permutation[OutputOffset + WildcardPairs.back()],
+                  Permutation[OutputOffset + NeighborIndex]);
+        WildcardPairs.pop_back();
+        // Mark the neighbor as processed.
+        UnpairedInputs[Neighbor].first = Wildcard;
+      } else
+        return false;
+    }
+  }
+  return true;
+}
+
 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
@@ -30044,6 +30150,145 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
     }
   }
 
+  // SHL/SRL/SRA on vXi8 can be widened to vYi16 or vYi32 if the constant
+  // amounts can be shuffled such that every pair or quad of adjacent elements
+  // has the same value. This introduces an extra shuffle before and after the
+  // shift, and it is profitable if the operand is aready a shuffle so that both
+  // can be merged or the extra shuffle is fast.
+  // (shift (shuffle X P1) S1) ->
+  // (shuffle (shift (shuffle X (shuffle P2 P1)) S2) P2^-1) where S2 can be
+  // widened, and P2^-1 is the inverse shuffle of P2.
+  // This is not profitable on XOP or AVX512 becasue it has 8/16-bit vector
+  // variable shift instructions.
+  // Picking out GFNI because normally it implies AVX512, and there is no
+  // latency data for CPU with GFNI and SSE or AVX only, but there are tests for
+  // such combination anyways.
+  if (ConstantAmt &&
+      (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) &&
+      R.hasOneUse() && Subtarget.hasSSSE3() && !Subtarget.hasAVX512() &&
+      !Subtarget.hasXOP() && !Subtarget.hasGFNI()) {
+    constexpr size_t LaneBytes = 16;
+    const size_t NumLanes = VT.getVectorNumElements() / LaneBytes;
+
+    SmallVector<int, 64> Permutation;
+    SmallVector<uint8_t, 64> ShiftAmt;
+    for (size_t I = 0; I < Amt.getNumOperands(); ++I) {
+      if (Amt.getOperand(I).isUndef())
+        ShiftAmt.push_back(~0);
+      else {
+        auto A = Amt.getConstantOperandVal(I);
+        ShiftAmt.push_back(A > 8 ? 8 : A);
+      }
+    }
+
+    // Check if we can find an in-lane shuffle to rearrange the shift amounts,
+    // if so, this transformation may be profitable. Cross-lane shuffle is
+    // almost never profitable because there is no general 1-instruction
+    // solution.
+    bool Profitable;
+    for (size_t I = 0; I < NumLanes; ++I) {
+      if (!(Profitable = PermuteAndPairVector(
+                ArrayRef(&ShiftAmt[I * LaneBytes], LaneBytes), Permutation)))
+        break;
+    }
+
+    // For AVX2, check if we can further rearrange shift amounts into adjacent
+    // quads, so that it can use VPS*LVD instead of VPMUL*W as it is 2 cycles
+    // faster.
+    bool IsAdjacentQuads = false;
+    if (Profitable && Subtarget.hasAVX2()) {
+      SmallVector<uint8_t, 64> EveryOtherShiftAmt;
+      for (size_t I = 0; I < Permutation.size(); I += 2) {
+        uint8_t Shift1 = ShiftAmt[Permutation[I]];
+        uint8_t Shift2 = ShiftAmt[Permutation[I + 1]];
+        assert(Shift1 == Shift2 || Shift1 == (uint8_t) ~0 ||
+               Shift2 == (uint8_t) ~0);
+        EveryOtherShiftAmt.push_back(Shift1 != (uint8_t) ~0 ? Shift1 : Shift2);
+      }
+      SmallVector<int, 32> Permutation2;
+      for (size_t I = 0; I < NumLanes; ++I) {
+        if (!(IsAdjacentQuads = PermuteAndPairVector(
+                  ArrayRef(&EveryOtherShiftAmt[I * LaneBytes / 2],
+                           LaneBytes / 2),
+                  Permutation2)))
+          break;
+      }
+      if (IsAdjacentQuads) {
+        SmallVector<int, 64> CombinedPermutation;
+        for (int Index : Permutation2) {
+          CombinedPermutation.push_back(Permutation[Index * 2]);
+          CombinedPermutation.push_back(Permutation[Index * 2 + 1]);
+        }
+        std::swap(Permutation, CombinedPermutation);
+      }
+    }
+
+    // For right shifts, (V)PMULHUW needs 2 extra instructions to handle an
+    // amount of 0, making it unprofitable.
+    if (!IsAdjacentQuads && (Opc == ISD::SRL || Opc == ISD::SRA) &&
+        any_of(ShiftAmt, [](uint8_t x) { return x == 0; }))
+      Profitable = false;
+
+    bool IsOperandShuffle = R.getOpcode() == ISD::VECTOR_SHUFFLE;
+    // If operand R is a shuffle, one of the two shuffles introduced by this
+    // transformation can be merged with it, and the extrast shuffle is 1 cycle.
+    // This is generally profitable because it eliminates one (or both) vector
+    // multiplication, which has to be scheduled at least 1 cycle apart.
+    // If operand R is not a shuffle, several cases are not profitable based on
+    // pipeline modeling, so we are excluding them here.
+    if (!IsOperandShuffle) {
+      // A hack to detect AMD Zen series CPU.
+      if (Subtarget.hasSSE4A()) {
+        if (!IsAdjacentQuads)
+          Profitable = false;
+        // A hack to detect Zen+ and Zen 2, because VPSRLVD is 2 cycles slower
+        // than in Zen 3, so this transformation should not be used.
+        else if (!Subtarget.hasVAES())
+          Profitable = false;
+      } else {
+        if ((Subtarget.hasAVX() && !Subtarget.hasAVX2()) ||
+            (Subtarget.hasAVX2() && !IsAdjacentQuads))
+          Profitable = false;
+      }
+    }
+
+    // If the shuffle is identity, do not insert it. It also prevents this
+    // transformation from being applied recursively.
+    if (llvm::equal(Permutation, llvm::seq(Permutation.size())))
+      Profitable = false;
+
+    // Found a permutation P that can rearrange the shift amouts into adjacent
+    // pair or quad of same values. Rewrite the shift S1(x) into P^-1(S2(P(x))).
+    if (Profitable) {
+      SDValue InnerShuffle =
+          DAG.getVectorShuffle(VT, dl, R, DAG.getUNDEF(VT), Permutation);
+      SmallVector<SDValue, 64> NewShiftAmt;
+      for (int Index : Permutation) {
+        NewShiftAmt.push_back(Amt.getOperand(Index));
+      }
+      // If using (V)PMULHUW, any undef pair is resolved to shift by 8 so that
+      // it does not create extra instructions in case it is resolved to 0.
+      for (size_t I = 0; I < NewShiftAmt.size(); I += 2) {
+        SDValue &Even = NewShiftAmt[I];
+        SDValue &Odd = NewShiftAmt[I + 1];
+        assert(Even.isUndef() || Odd.isUndef() ||
+               Even->getAsZExtVal() == Odd->getAsZExtVal());
+        if (!IsAdjacentQuads && Even.isUndef() && Odd.isUndef())
+          Even = DAG.getConstant(8, dl, VT.getScalarType());
+      }
+
+      SDValue NewShiftVector = DAG.getBuildVector(VT, dl, NewShiftAmt);
+      SDValue NewShift = DAG.getNode(Opc, dl, VT, InnerShuffle, NewShiftVector);
+      SmallVector<int, 64> InversePermutation(Permutation.size());
+      for (size_t I = 0; I < Permutation.size(); ++I) {
+        InversePermutation[Permutation[I]] = I;
+      }
+      SDValue OuterShuffle = DAG.getVectorShuffle(
+          VT, dl, NewShift, DAG.getUNDEF(VT), InversePermutation);
+      return OuterShuffle;
+    }
+  }
+
   // If possible, lower this packed shift into a vector multiply instead of
   // expanding it into a sequence of scalar shifts.
   // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts.