@@ -4926,22 +4926,69 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget);
   }
 
-  // Detect shuffles which can be re-expressed as vector selects; these are
-  // shuffles in which each element in the destination is taken from an element
-  // at the corresponding index in either source vectors.
-  bool IsSelect = all_of(enumerate(Mask), [&](const auto &MaskIdx) {
-    int MaskIndex = MaskIdx.value();
-    return MaskIndex < 0 || MaskIdx.index() == (unsigned)MaskIndex % NumElts;
-  });
-
+  // Handle any remaining single source shuffles
   assert(!V1.isUndef() && "Unexpected shuffle canonicalization");
+  if (V2.isUndef()) {
+    // We might be able to express the shuffle as a bitrotate. But even if we
+    // don't have Zvkb and have to expand, the expanded sequence of approx. 2
+    // shifts and a vor will have a higher throughput than a vrgather.
+    if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
+      return V;
+
+    // Base case for the two operand recursion below - handle the worst case
+    // single source shuffle.
+    unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL;
+    MVT IndexVT = VT.changeTypeToInteger();
+    // Since we can't introduce illegal index types at this stage, use i16 and
+    // vrgatherei16 if the corresponding index type for plain vrgather is greater
+    // than XLenVT.
+    if (IndexVT.getScalarType().bitsGT(XLenVT)) {
+      GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL;
+      IndexVT = IndexVT.changeVectorElementType(MVT::i16);
+    }
+
+    // If the mask allows, we can do all the index computation in 16 bits. This
+    // requires less work and less register pressure at high LMUL, and creates
+    // smaller constants which may be cheaper to materialize.
+    if (IndexVT.getScalarType().bitsGT(MVT::i16) && isUInt<16>(NumElts - 1) &&
+        (IndexVT.getSizeInBits() / Subtarget.getRealMinVLen()) > 1) {
+      GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL;
+      IndexVT = IndexVT.changeVectorElementType(MVT::i16);
+    }
+
+    MVT IndexContainerVT =
+        ContainerVT.changeVectorElementType(IndexVT.getScalarType());
+
+    V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
+    SmallVector<SDValue> GatherIndicesLHS;
+    for (int MaskIndex : Mask) {
+      bool IsLHSIndex = MaskIndex < (int)NumElts && MaskIndex >= 0;
+      GatherIndicesLHS.push_back(IsLHSIndex
+                                     ? DAG.getConstant(MaskIndex, DL, XLenVT)
+                                     : DAG.getUNDEF(XLenVT));
+    }
+    SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
+    LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG,
+                                         Subtarget);
+    SDValue Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
+                                 DAG.getUNDEF(ContainerVT), TrueMask, VL);
+    return convertFromScalableVector(VT, Gather, DAG, Subtarget);
+  }
 
   // By default we preserve the original operand order, and use a mask to
   // select LHS as true and RHS as false. However, since RVV vector selects may
   // feature splats but only on the LHS, we may choose to invert our mask and
   // instead select between RHS and LHS.
   bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
 
+  // Detect shuffles which can be re-expressed as vector selects; these are
+  // shuffles in which each element in the destination is taken from an element
+  // at the corresponding index in either source vectors.
+  bool IsSelect = all_of(enumerate(Mask), [&](const auto &MaskIdx) {
+    int MaskIndex = MaskIdx.value();
+    return MaskIndex < 0 || MaskIdx.index() == (unsigned)MaskIndex % NumElts;
+  });
   if (IsSelect) {
     // Now construct the mask that will be used by the vselect operation.
     SmallVector<SDValue> MaskVals;
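
The block that moved up is the heart of the single source path: the shuffle mask itself becomes the vrgather index vector (undef for negative lanes), and the surrounding logic only decides how wide the index elements must be, switching to vrgatherei16 when plain vrgather would need an index type wider than XLenVT, or when i16 indices reduce work at high LMUL. A minimal standalone model of the gather semantics in plain C++ rather than LLVM APIs — gatherModel and the test values are invented for illustration, and the out-of-range index yielding zero follows the RVV vrgather.vv spec:

#include <cstdio>
#include <vector>

// Scalar model of the single source lowering above: the shuffle mask is
// used directly as the vrgather index vector, with negative (undef) mask
// lanes left unconstrained. Per the RVV spec, vrgather.vv writes 0 for an
// out-of-range index, modeled here by the bounds check.
static std::vector<int> gatherModel(const std::vector<int> &Src,
                                    const std::vector<int> &Mask) {
  std::vector<int> Out;
  int N = (int)Src.size();
  for (int MaskIndex : Mask) {
    if (MaskIndex < 0) {
      Out.push_back(0); // undef lane: any value is acceptable
      continue;
    }
    Out.push_back(MaskIndex < N ? Src[MaskIndex] : 0);
  }
  return Out;
}

int main() {
  // A single source mask that is neither a select nor a rotate, so it
  // reaches the vrgather base case.
  std::vector<int> Src = {10, 11, 12, 13};
  std::vector<int> Mask = {3, 3, 0, -1};
  for (int V : gatherModel(Src, Mask))
    printf("%d ", V); // prints: 13 13 10 0
  printf("\n");
  return 0;
}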
@@ -4959,12 +5006,6 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V1, V2);
   }
 
-  // We might be able to express the shuffle as a bitrotate. But even if we
-  // don't have Zvkb and have to expand, the expanded sequence of approx. 2
-  // shifts and a vor will have a higher throughput than a vrgather.
-  if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
-    return V;
-
   if (VT.getScalarSizeInBits() == 8 && VT.getVectorNumElements() > 256) {
     // On such a large vector we're unable to use i8 as the index type.
     // FIXME: We could promote the index to i16 and use vrgatherei16, but that
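
The rotate check deleted here (and re-added inside the single source path above) pays off even without Zvkb: the expansion is roughly two shifts and a vor, which still beats a vrgather in throughput. A standalone sketch of the equivalence being exploited — plain C++ with an invented rotr helper, not the backend's actual matcher — showing that a byte shuffle which rotates the bytes of a little-endian i64 element is the same operation as a bit rotate of that element:

#include <cstdint>
#include <cstdio>
#include <cstring>

// The "approx. 2 shifts and a vor" expansion of a bit rotate, on one
// 64-bit element. With Zvkb this would just be a vror.vi.
static uint64_t rotr(uint64_t X, unsigned S) {
  return (X >> S) | (X << (64 - S));
}

int main() {
  uint64_t X = 0x0123456789abcdef;
  // Apply the byte shuffle out[i] = in[(i + 1) % 8] directly on memory:
  // within a little-endian i64 element this rotates bytes toward lane 0.
  uint8_t In[8], Out[8];
  memcpy(In, &X, 8);
  for (int I = 0; I < 8; ++I)
    Out[I] = In[(I + 1) % 8];
  uint64_t Shuffled;
  memcpy(&Shuffled, Out, 8);
  printf("%d\n", Shuffled == rotr(X, 8)); // prints 1: the two forms agree
  return 0;
}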
@@ -4998,46 +5039,6 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
     SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
 
-  // Base case for the recursion just below - handle the worst case
-  // single source permutation. Note that all the splat variants
-  // are handled above.
-  if (V2.isUndef()) {
-    unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL;
-    MVT IndexVT = VT.changeTypeToInteger();
-    // Since we can't introduce illegal index types at this stage, use i16 and
-    // vrgatherei16 if the corresponding index type for plain vrgather is greater
-    // than XLenVT.
-    if (IndexVT.getScalarType().bitsGT(XLenVT)) {
-      GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL;
-      IndexVT = IndexVT.changeVectorElementType(MVT::i16);
-    }
-
-    // If the mask allows, we can do all the index computation in 16 bits. This
-    // requires less work and less register pressure at high LMUL, and creates
-    // smaller constants which may be cheaper to materialize.
-    if (IndexVT.getScalarType().bitsGT(MVT::i16) && isUInt<16>(NumElts - 1) &&
-        (IndexVT.getSizeInBits() / Subtarget.getRealMinVLen()) > 1) {
-      GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL;
-      IndexVT = IndexVT.changeVectorElementType(MVT::i16);
-    }
-
-    MVT IndexContainerVT =
-        ContainerVT.changeVectorElementType(IndexVT.getScalarType());
-
-    V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
-    SmallVector<SDValue> GatherIndicesLHS;
-    for (int ShuffleIdx : ShuffleMaskLHS)
-      GatherIndicesLHS.push_back(ShuffleIdx != -1
-                                     ? DAG.getConstant(ShuffleIdx, DL, XLenVT)
-                                     : DAG.getUNDEF(XLenVT));
-    SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS);
-    LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG,
-                                         Subtarget);
-    SDValue Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices,
-                                 DAG.getUNDEF(ContainerVT), TrueMask, VL);
-    return convertFromScalableVector(VT, Gather, DAG, Subtarget);
-  }
-
   // Recursively invoke lowering for each operand if we had two
   // independent single source shuffles, and then combine the result via a
   // vselect. Note that the vselect will likely be folded back into the
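
For the two source case that the trailing context lines describe, the lowering recurses: the mask is split into two single source masks plus a lane-wise select mask, each operand is shuffled independently, and the results are merged with a vselect. A standalone model of that decomposition — plain C++ with invented names (shuffleOne, MaskLHS, MaskRHS); the real code builds vrgather and vselect nodes instead:

#include <cstdio>
#include <vector>

// Model of one single source shuffle (what each recursive call produces).
static std::vector<int> shuffleOne(const std::vector<int> &Src,
                                   const std::vector<int> &Mask) {
  std::vector<int> Out;
  for (int M : Mask)
    Out.push_back(M < 0 ? 0 : Src[M]); // -1 lanes are undef; 0 is arbitrary
  return Out;
}

int main() {
  std::vector<int> V1 = {10, 11, 12, 13}, V2 = {20, 21, 22, 23};
  std::vector<int> Mask = {0, 5, 3, 6}; // indices >= 4 pick from V2
  int N = 4;
  // Split the two source mask into two single source masks and a
  // select mask, mirroring ShuffleMaskLHS/ShuffleMaskRHS in the code.
  std::vector<int> MaskLHS, MaskRHS, UseLHS;
  for (int M : Mask) {
    bool FromLHS = M >= 0 && M < N;
    MaskLHS.push_back(FromLHS ? M : -1);
    MaskRHS.push_back(FromLHS || M < 0 ? -1 : M - N);
    UseLHS.push_back(FromLHS);
  }
  std::vector<int> L = shuffleOne(V1, MaskLHS), R = shuffleOne(V2, MaskRHS);
  for (int I = 0; I < N; ++I)
    printf("%d ", UseLHS[I] ? L[I] : R[I]); // prints: 10 21 13 22
  printf("\n");
  return 0;
}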