@@ -7809,6 +7809,92 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
7809
7809
DAG.getConstant(ARMCC::NE, dl, MVT::i32));
7810
7810
}
7811
7811
7812
+ static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
7813
+ ArrayRef<int> ShuffleMask,
7814
+ SelectionDAG &DAG) {
7815
+ // Attempt to lower the vector shuffle using as many whole register movs as
7816
+ // possible. This is useful for types smaller than 32bits, which would
7817
+ // often otherwise become a series for grp movs.
7818
+ SDLoc dl(Op);
7819
+ EVT VT = Op.getValueType();
7820
+ if (VT.getScalarSizeInBits() >= 32)
7821
+ return SDValue();
7822
+
7823
+ assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
7824
+ "Unexpected vector type");
7825
+ int NumElts = VT.getVectorNumElements();
7826
+ int QuarterSize = NumElts / 4;
7827
+ // The four final parts of the vector, as i32's
7828
+ SDValue Parts[4];
7829
+
7830
+ // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
7831
+ // <u,u,u,u>), returning the vmov lane index
7832
+ auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
7833
+ // Detect which mov lane this would be from the first non-undef element.
7834
+ int MovIdx = -1;
7835
+ for (int i = 0; i < Length; i++) {
7836
+ if (ShuffleMask[Start + i] >= 0) {
7837
+ if (ShuffleMask[Start + i] % Length != i)
7838
+ return -1;
7839
+ MovIdx = ShuffleMask[Start + i] / Length;
7840
+ break;
7841
+ }
7842
+ }
7843
+ // If all items are undef, leave this for other combines
7844
+ if (MovIdx == -1)
7845
+ return -1;
7846
+ // Check the remaining values are the correct part of the same mov
7847
+ for (int i = 1; i < Length; i++) {
7848
+ if (ShuffleMask[Start + i] >= 0 &&
7849
+ (ShuffleMask[Start + i] / Length != MovIdx ||
7850
+ ShuffleMask[Start + i] % Length != i))
7851
+ return -1;
7852
+ }
7853
+ return MovIdx;
7854
+ };
7855
+
7856
+ for (int Part = 0; Part < 4; ++Part) {
7857
+ // Does this part look like a mov
7858
+ int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
7859
+ if (Elt != -1) {
7860
+ SDValue Input = Op->getOperand(0);
7861
+ if (Elt >= 4) {
7862
+ Input = Op->getOperand(1);
7863
+ Elt -= 4;
7864
+ }
7865
+ SDValue BitCast = DAG.getBitcast(MVT::v4i32, Input);
7866
+ Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, BitCast,
7867
+ DAG.getConstant(Elt, dl, MVT::i32));
7868
+ }
7869
+ }
7870
+
7871
+ // Nothing interesting found, just return
7872
+ if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
7873
+ return SDValue();
7874
+
7875
+ // The other parts need to be built with the old shuffle vector, cast to a
7876
+ // v4i32 and extract_vector_elts
7877
+ if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
7878
+ SmallVector<int, 16> NewShuffleMask;
7879
+ for (int Part = 0; Part < 4; ++Part)
7880
+ for (int i = 0; i < QuarterSize; i++)
7881
+ NewShuffleMask.push_back(
7882
+ Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
7883
+ SDValue NewShuffle = DAG.getVectorShuffle(
7884
+ VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
7885
+ SDValue BitCast = DAG.getBitcast(MVT::v4i32, NewShuffle);
7886
+
7887
+ for (int Part = 0; Part < 4; ++Part)
7888
+ if (!Parts[Part])
7889
+ Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
7890
+ BitCast, DAG.getConstant(Part, dl, MVT::i32));
7891
+ }
7892
+ // Build a vector out of the various parts and bitcast it back to the original
7893
+ // type.
7894
+ SDValue NewVec = DAG.getBuildVector(MVT::v4i32, dl, Parts);
7895
+ return DAG.getBitcast(VT, NewVec);
7896
+ }
7897
+
7812
7898
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
7813
7899
const ARMSubtarget *ST) {
7814
7900
SDValue V1 = Op.getOperand(0);
@@ -8003,6 +8089,10 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
8003
8089
if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8004
8090
return NewOp;
8005
8091
8092
+ if (ST->hasMVEIntegerOps())
8093
+ if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8094
+ return NewOp;
8095
+
8006
8096
return SDValue();
8007
8097
}
8008
8098
0 commit comments