Skip to content

Commit 792fab3

Browse files
committed
[ARM] Attempt to use whole register vmovs for MVE shuffles.
MVE doesn't have the range of shuffle instructions available in Neon. We also cannot use the trick of cutting a difficult vector shuffle in half to simplify things. Instead we need to be more careful about how we lower shuffles. This patch adds an extra combine that attempts to find "whole lane" vmovs when lowering shuffles of smaller types. This helps us make some shuffles a lot simpler, generating single lane movs for the parts that can make use of it, falling back to the original shuffle for the rest. Differential Revision: https://reviews.llvm.org/D69509
1 parent 3a6eb5f commit 792fab3

File tree

8 files changed

+1590
-1828
lines changed

8 files changed

+1590
-1828
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7809,6 +7809,92 @@ static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG,
78097809
DAG.getConstant(ARMCC::NE, dl, MVT::i32));
78107810
}
78117811

7812+
/// Try to lower a shuffle of a vector with sub-32-bit elements by treating the
/// result as four 32-bit lanes and copying whole lanes wherever the mask
/// allows it.
///
/// Each quarter of \p ShuffleMask is inspected. If its defined entries select
/// one complete, correctly aligned 32-bit lane of either shuffle operand, that
/// lane is extracted directly from the operand viewed as v4i32. Quarters that
/// do not match are taken from a second shuffle whose mask keeps only those
/// quarters (the matched ones are marked undef). The four i32 pieces are then
/// assembled with a build_vector and bitcast back to the original type.
///
/// This is aimed at types smaller than 32 bits, which would often otherwise
/// be lowered as a series of lane-by-lane moves through GPRs.
///
/// \returns the replacement value, or an empty SDValue when the element type
/// is 32 bits or wider, or when no quarter of the mask is a whole-lane move.
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op,
                                            ArrayRef<int> ShuffleMask,
                                            SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  // Elements that already fill a 32-bit lane gain nothing from this lowering.
  if (VT.getScalarSizeInBits() >= 32)
    return SDValue();

  assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
         "Unexpected vector type");

  SDLoc dl(Op);
  // Number of source elements that make up one 32-bit lane.
  const int EltsPerLane = VT.getVectorNumElements() / 4;

  // Returns the 32-bit lane index (counted across both operands) that the mask
  // slice [Start, Start + Length) copies in full, e.g. <0,1,2,3> or <u,5,6,7>.
  // Returns -1 when the slice mixes lanes, is misaligned within a lane, or is
  // entirely undef (<u,u,u,u> is left for other combines).
  auto findWholeLane = [](ArrayRef<int> Mask, int Start, int Length) {
    int Lane = -1;
    for (int Offset = 0; Offset != Length; ++Offset) {
      const int Src = Mask[Start + Offset];
      if (Src < 0)
        continue; // Undef entries match anything.
      // The element must sit at the same position inside its source lane.
      if (Src % Length != Offset)
        return -1;
      const int SrcLane = Src / Length;
      if (Lane == -1)
        Lane = SrcLane; // First defined entry picks the lane.
      else if (Lane != SrcLane)
        return -1; // Later entries must come from that same lane.
    }
    return Lane;
  };

  // The four 32-bit pieces of the result; a null entry means "not found yet".
  SDValue Parts[4];
  int NumWholeLanes = 0;

  for (int Quarter = 0; Quarter != 4; ++Quarter) {
    int Lane =
        findWholeLane(ShuffleMask, Quarter * EltsPerLane, EltsPerLane);
    if (Lane == -1)
      continue;

    // Lanes 0-3 belong to the first operand and 4-7 to the second.
    const bool FromSecondOperand = Lane >= 4;
    SDValue Source = Op->getOperand(FromSecondOperand ? 1 : 0);
    if (FromSecondOperand)
      Lane -= 4;

    SDValue SourceAsI32 = DAG.getBitcast(MVT::v4i32, Source);
    SDValue LaneIdx = DAG.getConstant(Lane, dl, MVT::i32);
    Parts[Quarter] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                 SourceAsI32, LaneIdx);
    ++NumWholeLanes;
  }

  // No quarter is a whole-lane move, so there is nothing to gain here.
  if (NumWholeLanes == 0)
    return SDValue();

  // Any quarter still missing is produced by the original shuffle restricted
  // to those quarters, viewed as v4i32 and read back one lane at a time.
  if (NumWholeLanes != 4) {
    SmallVector<int, 16> RemainderMask;
    for (int Quarter = 0; Quarter != 4; ++Quarter) {
      const bool AlreadyCovered = !!Parts[Quarter];
      for (int Offset = 0; Offset != EltsPerLane; ++Offset) {
        int Entry = -1;
        if (!AlreadyCovered)
          Entry = ShuffleMask[Quarter * EltsPerLane + Offset];
        RemainderMask.push_back(Entry);
      }
    }

    SDValue Remainder = DAG.getVectorShuffle(
        VT, dl, Op->getOperand(0), Op->getOperand(1), RemainderMask);
    SDValue RemainderAsI32 = DAG.getBitcast(MVT::v4i32, Remainder);

    for (int Quarter = 0; Quarter != 4; ++Quarter) {
      if (Parts[Quarter])
        continue;
      SDValue LaneIdx = DAG.getConstant(Quarter, dl, MVT::i32);
      Parts[Quarter] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
                                   RemainderAsI32, LaneIdx);
    }
  }

  // Reassemble the four lanes and return to the caller's vector type.
  SDValue Combined = DAG.getBuildVector(MVT::v4i32, dl, Parts);
  return DAG.getBitcast(VT, Combined);
}
7897+
78127898
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
78137899
const ARMSubtarget *ST) {
78147900
SDValue V1 = Op.getOperand(0);
@@ -8003,6 +8089,10 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
80038089
if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
80048090
return NewOp;
80058091

8092+
if (ST->hasMVEIntegerOps())
8093+
if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8094+
return NewOp;
8095+
80068096
return SDValue();
80078097
}
80088098

llvm/test/CodeGen/Thumb2/mve-pred-shuffle.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -260,9 +260,8 @@ define <8 x i16> @shuffle4_v8i16(<8 x i16> %src, <8 x i16> %a, <8 x i16> %b) {
260260
; CHECK-NEXT: vpsel q0, q1, q0
261261
; CHECK-NEXT: vmov.u16 r0, q0[0]
262262
; CHECK-NEXT: vdup.16 q1, r0
263-
; CHECK-NEXT: vmov.u16 r0, q0[1]
264-
; CHECK-NEXT: vmov.16 q1[7], r0
265263
; CHECK-NEXT: add r0, sp, #16
264+
; CHECK-NEXT: vmov.f32 s7, s0
266265
; CHECK-NEXT: vldrw.u32 q0, [r0]
267266
; CHECK-NEXT: mov r0, sp
268267
; CHECK-NEXT: vcmp.i16 ne, q1, zr

llvm/test/CodeGen/Thumb2/mve-shuffle.ll

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -98,11 +98,7 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) {
9898
; CHECK-LABEL: shuffle3_i16:
9999
; CHECK: @ %bb.0: @ %entry
100100
; CHECK-NEXT: vmov q1, q0
101-
; CHECK-NEXT: vmov.u16 r0, q0[4]
102-
; CHECK-NEXT: vmov.16 q0[0], r0
103-
; CHECK-NEXT: vmov.u16 r0, q1[5]
104-
; CHECK-NEXT: vmov.16 q0[1], r0
105-
; CHECK-NEXT: vmov.u16 r0, q1[7]
101+
; CHECK-NEXT: vmov.u16 r0, q0[7]
106102
; CHECK-NEXT: vmov.16 q0[2], r0
107103
; CHECK-NEXT: vmov.u16 r0, q1[6]
108104
; CHECK-NEXT: vmov.16 q0[3], r0
@@ -114,6 +110,7 @@ define arm_aapcs_vfpcc <8 x i16> @shuffle3_i16(<8 x i16> %src) {
114110
; CHECK-NEXT: vmov.16 q0[6], r0
115111
; CHECK-NEXT: vmov.u16 r0, q1[0]
116112
; CHECK-NEXT: vmov.16 q0[7], r0
113+
; CHECK-NEXT: vmov.f32 s0, s6
117114
; CHECK-NEXT: bx lr
118115
entry:
119116
%out = shufflevector <8 x i16> %src, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 7, i32 6, i32 3, i32 1, i32 2, i32 0>
@@ -387,16 +384,11 @@ entry:
387384
define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) {
388385
; CHECK-LABEL: shuffle3_f16:
389386
; CHECK: @ %bb.0: @ %entry
390-
; CHECK-NEXT: vmovx.f16 s4, s2
391-
; CHECK-NEXT: vmov r1, s2
392-
; CHECK-NEXT: vmov r0, s4
393-
; CHECK-NEXT: vmov.16 q1[0], r1
394-
; CHECK-NEXT: vmovx.f16 s8, s3
395-
; CHECK-NEXT: vmov.16 q1[1], r0
396-
; CHECK-NEXT: vmov r0, s8
397-
; CHECK-NEXT: vmovx.f16 s8, s1
398-
; CHECK-NEXT: vmov.16 q1[2], r0
387+
; CHECK-NEXT: vmovx.f16 s4, s3
399388
; CHECK-NEXT: vmov r0, s3
389+
; CHECK-NEXT: vmov r1, s4
390+
; CHECK-NEXT: vmovx.f16 s8, s1
391+
; CHECK-NEXT: vmov.16 q1[2], r1
400392
; CHECK-NEXT: vmov.16 q1[3], r0
401393
; CHECK-NEXT: vmov r0, s8
402394
; CHECK-NEXT: vmovx.f16 s8, s0
@@ -407,6 +399,7 @@ define arm_aapcs_vfpcc <8 x half> @shuffle3_f16(<8 x half> %src) {
407399
; CHECK-NEXT: vmov.16 q1[6], r0
408400
; CHECK-NEXT: vmov r0, s0
409401
; CHECK-NEXT: vmov.16 q1[7], r0
402+
; CHECK-NEXT: vmov.f32 s4, s2
410403
; CHECK-NEXT: vmov q0, q1
411404
; CHECK-NEXT: bx lr
412405
entry:

0 commit comments

Comments
 (0)