Commit 2492075
[X86][SSE] lowerShuffleAsBitRotate - lower vXi8 shuffles to ROTL on pre-SSSE3 targets
Without PSHUFB we are better off using ROTL (expanded to OR(SHL,SRL)) than the generic v16i8 shuffle lowering - but if we can widen to v8i16 or more then the existing shuffle lowerings are still the better option.

REAPPLIED: The original commit rG11c16e71598d was reverted at rGde1d90299b16 as it wasn't accounting for later lowering. This version emits ROTLI or the OR(VSHLI/VSRLI) pattern directly to avoid the issue.
1 parent de1c287 commit 2492075
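
As a concrete illustration of the commit message (a minimal scalar sketch, not code from the patch): a v16i8 shuffle mask such as <1,0,3,2,...> swaps the bytes within each i16 lane, which is exactly a rotate of each 16-bit lane by 8 bits, and that rotate expands to OR(SHL,SRL):

#include <cassert>
#include <cstdint>

// Scalar sketch (illustrative only): rotate a 16-bit lane left by Amt
// bits using only shifts and OR, mirroring the OR(SHL,SRL) expansion
// the commit message describes for pre-SSSE3 targets.
static uint16_t rotl16(uint16_t X, unsigned Amt) {
  Amt &= 15;
  if (Amt == 0)
    return X;
  return uint16_t((X << Amt) | (X >> (16 - Amt)));
}

int main() {
  // Swapping the two bytes of an i16 lane - what a v16i8 shuffle mask
  // like <1,0,3,2,...> requests - is exactly a rotate by 8 bits.
  assert(rotl16(0xABCD, 8) == 0xCDAB);
  return 0;
}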

5 files changed: +157 additions, -201 deletions

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 27 additions & 6 deletions
@@ -11704,7 +11704,7 @@ static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
   return RotateAmt;
 }
 
-/// Lower shuffle using ISD::ROTL rotations.
+/// Lower shuffle using X86ISD::VROTLI rotations.
 static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
                                        ArrayRef<int> Mask,
                                        const X86Subtarget &Subtarget,
@@ -11716,25 +11716,46 @@ static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
   assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
 
   // Only XOP + AVX512 targets have bit rotation instructions.
+  // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
   bool IsLegal =
       (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
-  if (!IsLegal)
+  if (!IsLegal && Subtarget.hasSSE3())
     return SDValue();
 
   // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
-  int MinSubElts = Subtarget.hasXOP() ? 2 : std::max(32 / EltSizeInBits, 2);
+  int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
   int MaxSubElts = 64 / EltSizeInBits;
   for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
     int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
     if (RotateAmt < 0)
       continue;
-    int RotateAmtInBits = RotateAmt * EltSizeInBits;
+
     int NumElts = VT.getVectorNumElements();
     MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
     MVT RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
+
+    // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
+    // expanded to OR(SRL,SHL), will be more efficient, but if they can
+    // widen to vXi16 or more then the existing lowering will be better.
+    int RotateAmtInBits = RotateAmt * EltSizeInBits;
+    if (!IsLegal) {
+      if ((RotateAmtInBits % 16) == 0)
+        return SDValue();
+      // TODO: Use getTargetVShiftByConstNode.
+      unsigned ShlAmt = RotateAmtInBits;
+      unsigned SrlAmt = RotateSVT.getScalarSizeInBits() - RotateAmtInBits;
+      V1 = DAG.getBitcast(RotateVT, V1);
+      SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
+                                DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
+      SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
+                                DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
+      SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
+      return DAG.getBitcast(VT, Rot);
+    }
+
     SDValue Rot =
-        DAG.getNode(ISD::ROTL, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
-                    DAG.getConstant(RotateAmtInBits, DL, RotateVT));
+        DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
+                    DAG.getTargetConstant(RotateAmtInBits, DL, MVT::i8));
     return DAG.getBitcast(VT, Rot);
   }
 
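
The guard above is the heart of the re-application: on pre-SSSE3 targets the rotate is only profitable when it cannot be expressed as a whole-i16 shuffle, and the shift/OR nodes are built immediately instead of leaving ISD::ROTL for later lowering. As a companion, here is a simplified, self-contained model of the mask matching that feeds this path (the helper name matchAsLaneRotate is hypothetical; the real matchShuffleAsBitRotate works on ArrayRef<int> and handles more cases):

#include <cassert>
#include <vector>

// Simplified sketch of the matcher: within each group of NumSubElts
// lanes, every defined mask element must read from the same group, and
// all lanes must agree on one rotate amount. Returns that amount in
// lanes, or -1 if the mask is not a rotate.
static int matchAsLaneRotate(const std::vector<int> &Mask, int NumSubElts) {
  int RotateAmt = -1;
  for (int I = 0, E = (int)Mask.size(); I != E; I += NumSubElts) {
    for (int J = 0; J != NumSubElts; ++J) {
      int M = Mask[I + J];
      if (M < 0)
        continue; // undef lanes match any rotate amount
      if (M < I || M >= I + NumSubElts)
        return -1; // reads across a group boundary
      // For a little-endian ROTL by R lanes, lane J reads lane (J - R).
      int Offset = ((J - (M - I)) % NumSubElts + NumSubElts) % NumSubElts;
      if (RotateAmt >= 0 && RotateAmt != Offset)
        return -1; // lanes disagree on the rotate amount
      RotateAmt = Offset;
    }
  }
  return RotateAmt;
}

int main() {
  // The byte-swap-within-i16 mask from the updated tests below.
  std::vector<int> Mask = {1, 0, 3, 2, 5, 4, 7, 6,
                           9, 8, 11, 10, 13, 12, 15, 14};
  // One lane of rotate on 2-lane (i16) groups => RotateAmtInBits = 8,
  // not a multiple of 16, so the pre-SSSE3 path emits VSHLI/VSRLI/OR.
  assert(matchAsLaneRotate(Mask, 2) == 1);
  return 0;
}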

llvm/test/CodeGen/X86/bitreverse.ll

Lines changed: 4 additions & 9 deletions
@@ -52,15 +52,10 @@ define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
 ;
 ; X64-LABEL: test_bitreverse_v2i16:
 ; X64:       # %bb.0:
-; X64-NEXT:    pxor %xmm1, %xmm1
-; X64-NEXT:    movdqa %xmm0, %xmm2
-; X64-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; X64-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
-; X64-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
-; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
-; X64-NEXT:    packuswb %xmm2, %xmm0
+; X64-NEXT:    movdqa %xmm0, %xmm1
+; X64-NEXT:    psrlw $8, %xmm1
+; X64-NEXT:    psllw $8, %xmm0
+; X64-NEXT:    por %xmm1, %xmm0
 ; X64-NEXT:    movdqa %xmm0, %xmm1
 ; X64-NEXT:    psllw $4, %xmm1
 ; X64-NEXT:    pand {{.*}}(%rip), %xmm1

llvm/test/CodeGen/X86/bswap-vector.ll

Lines changed: 16 additions & 35 deletions
@@ -11,15 +11,10 @@ declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
 define <8 x i16> @test1(<8 x i16> %v) {
 ; CHECK-NOSSSE3-LABEL: test1:
 ; CHECK-NOSSSE3:       # %bb.0: # %entry
-; CHECK-NOSSSE3-NEXT:    pxor %xmm1, %xmm1
-; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
-; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
-; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
-; CHECK-NOSSSE3-NEXT:    packuswb %xmm2, %xmm0
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NOSSSE3-NEXT:    psrlw $8, %xmm1
+; CHECK-NOSSSE3-NEXT:    psllw $8, %xmm0
+; CHECK-NOSSSE3-NEXT:    por %xmm1, %xmm0
 ; CHECK-NOSSSE3-NEXT:    ret{{[l|q]}}
 ;
 ; CHECK-SSSE3-LABEL: test1:
@@ -132,23 +127,14 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
 define <16 x i16> @test4(<16 x i16> %v) {
 ; CHECK-NOSSSE3-LABEL: test4:
 ; CHECK-NOSSSE3:       # %bb.0: # %entry
-; CHECK-NOSSSE3-NEXT:    pxor %xmm2, %xmm2
-; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm3
-; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
-; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
-; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
-; CHECK-NOSSSE3-NEXT:    packuswb %xmm3, %xmm0
-; CHECK-NOSSSE3-NEXT:    movdqa %xmm1, %xmm3
-; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
-; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
-; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,4,7,6]
-; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
-; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,7,6]
-; CHECK-NOSSSE3-NEXT:    packuswb %xmm3, %xmm1
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm2
+; CHECK-NOSSSE3-NEXT:    psrlw $8, %xmm2
+; CHECK-NOSSSE3-NEXT:    psllw $8, %xmm0
+; CHECK-NOSSSE3-NEXT:    por %xmm2, %xmm0
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm1, %xmm2
+; CHECK-NOSSSE3-NEXT:    psrlw $8, %xmm2
+; CHECK-NOSSSE3-NEXT:    psllw $8, %xmm1
+; CHECK-NOSSSE3-NEXT:    por %xmm2, %xmm1
 ; CHECK-NOSSSE3-NEXT:    ret{{[l|q]}}
 ;
 ; CHECK-SSSE3-LABEL: test4:
@@ -252,15 +238,10 @@ declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
 define <4 x i16> @test7(<4 x i16> %v) {
 ; CHECK-NOSSSE3-LABEL: test7:
 ; CHECK-NOSSSE3:       # %bb.0: # %entry
-; CHECK-NOSSSE3-NEXT:    pxor %xmm1, %xmm1
-; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm2
-; CHECK-NOSSSE3-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
-; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
-; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,4,7,6]
-; CHECK-NOSSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; CHECK-NOSSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
-; CHECK-NOSSSE3-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,4,7,6]
-; CHECK-NOSSSE3-NEXT:    packuswb %xmm2, %xmm0
+; CHECK-NOSSSE3-NEXT:    movdqa %xmm0, %xmm1
+; CHECK-NOSSSE3-NEXT:    psrlw $8, %xmm1
+; CHECK-NOSSSE3-NEXT:    psllw $8, %xmm0
+; CHECK-NOSSSE3-NEXT:    por %xmm1, %xmm0
 ; CHECK-NOSSSE3-NEXT:    ret{{[l|q]}}
 ;
 ; CHECK-SSSE3-LABEL: test7:
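
All three hunks above are instances of one identity: bswap on an i16 lane is a rotate by 8 bits, so each 128-bit half of the vector now needs only psrlw/psllw/por. A standalone exhaustive check of that identity (uses the GCC/Clang __builtin_bswap16 intrinsic; illustration only):

#include <cassert>
#include <cstdint>

int main() {
  // Verify over all 16-bit values that bswap equals a rotate by 8,
  // which is why the NOSSSE3 CHECK lines collapse to psllw/psrlw/por.
  for (uint32_t V = 0; V <= 0xFFFF; ++V) {
    uint16_t X = uint16_t(V);
    uint16_t Rot = uint16_t((X << 8) | (X >> 8));
    assert(__builtin_bswap16(X) == Rot);
  }
  return 0;
}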
