Skip to content

Commit d0d48a9

Browse files
committed
[X86] Lower vector interleave into unpck and perm
[This Godbolt link](https://godbolt.org/z/s17Kv1s9T) shows different codegen between clang and gcc for a transpose operation. clang result: ``` vmovdqu xmm0, xmmword ptr [rcx + rax] vmovdqu xmm1, xmmword ptr [rcx + rax + 16] vmovdqu xmm2, xmmword ptr [r8 + rax] vmovdqu xmm3, xmmword ptr [r8 + rax + 16] vpunpckhbw xmm4, xmm2, xmm0 vpunpcklbw xmm0, xmm2, xmm0 vpunpcklbw xmm2, xmm3, xmm1 vpunpckhbw xmm1, xmm3, xmm1 vmovdqu xmmword ptr [rdi + 2*rax + 48], xmm1 vmovdqu xmmword ptr [rdi + 2*rax + 32], xmm2 vmovdqu xmmword ptr [rdi + 2*rax], xmm0 vmovdqu xmmword ptr [rdi + 2*rax + 16], xmm4 ``` gcc result: ``` vmovdqu ymm3, YMMWORD PTR [rdi+rax] vpunpcklbw ymm1, ymm3, YMMWORD PTR [rsi+rax] vpunpckhbw ymm0, ymm3, YMMWORD PTR [rsi+rax] vperm2i128 ymm2, ymm1, ymm0, 32 vperm2i128 ymm1, ymm1, ymm0, 49 vmovdqu YMMWORD PTR [rcx+rax*2], ymm2 vmovdqu YMMWORD PTR [rcx+32+rax*2], ymm1 ``` clang's code is roughly 15% slower than gcc's when evaluated on an internal compression benchmark. The loop vectorizer generates the following shufflevector intrinsic: ``` %interleaved.vec = shufflevector <32 x i8> %a, <32 x i8> %b, <64 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> ``` which is lowered to SelectionDAG: ``` t2: v32i8,ch = CopyFromReg t0, Register:v32i8 %0 t6: v64i8 = concat_vectors t2, undef:v32i8 t4: v32i8,ch = CopyFromReg t0, Register:v32i8 %1 t7: v64i8 = concat_vectors t4, undef:v32i8 t8: v64i8 = 
vector_shuffle<0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79,16,80,17,81,18,82,19,83,20,84,21,85,22,86,23,87,24,88,25,89,26,90,27,91,28,92,29,93,30,94,31,95> t6, t7 ``` So far this `vector_shuffle` is good enough for us to pattern-match and transform, but as we go down the SelectionDAG pipeline, it gets split into smaller shuffles. During dagcombine1, the shuffle is split by `foldShuffleOfConcatUndefs`. ``` // shuffle (concat X, undef), (concat Y, undef), Mask --> // concat (shuffle X, Y, Mask0), (shuffle X, Y, Mask1) t2: v32i8,ch = CopyFromReg t0, Register:v32i8 %0 t4: v32i8,ch = CopyFromReg t0, Register:v32i8 %1 t19: v32i8 = vector_shuffle<0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47> t2, t4 t15: ch,glue = CopyToReg t0, Register:v32i8 $ymm0, t19 t20: v32i8 = vector_shuffle<16,48,17,49,18,50,19,51,20,52,21,53,22,54,23,55,24,56,25,57,26,58,27,59,28,60,29,61,30,62,31,63> t2, t4 t17: ch,glue = CopyToReg t15, Register:v32i8 $ymm1, t20, t15:1 ``` With `foldShuffleOfConcatUndefs` commented out, the vector is still split later by the type legalizer, which comes after dagcombine1, because v64i8 is not a legal type in AVX2 (64 * 8 = 512 bits while ymm = 256 bits). There doesn't seem to be a good way to avoid this split. Lowering the `vector_shuffle` into unpck and perm during dagcombine1 is too early. Therefore, although somewhat inconvenient, we decided to go with pattern-matching a pair of vector shuffles later in the SelectionDAG pipeline, as part of `lowerV32I8Shuffle`. The code looks at the two operands of the first shuffle it encounters, iterates through the users of the operands, and tries to find two shuffles that are consecutive interleaves. Once the pattern is found, it lowers them into unpcks and perms. It returns the perm for the shuffle that's currently being lowered (letting ISel modify the DAG), and replaces the other shuffle in place. 
Reviewed By: RKSimon Differential Revision: https://reviews.llvm.org/D134477
1 parent 4467c78 commit d0d48a9

File tree

6 files changed

+335
-233
lines changed

6 files changed

+335
-233
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17775,6 +17775,90 @@ static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
1777517775
DAG.getIntPtrConstant(0, DL));
1777617776
}
1777717777

17778+
// a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2
17779+
// b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2
17780+
// =>
17781+
// ul = unpckl v1, v2
17782+
// uh = unpckh v1, v2
17783+
// a = vperm ul, uh
17784+
// b = vperm ul, uh
17785+
//
17786+
// Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck
17787+
// and permute. We cannot directly match v3 because it is split into two
17788+
// 256-bit vectors in earlier isel stages. Therefore, this function matches a
17789+
// pair of 256-bit shuffles and makes sure the masks are consecutive.
17790+
//
17791+
// Once unpck and permute nodes are created, the permute corresponding to this
17792+
// shuffle is returned, while the other permute replaces the other half of the
17793+
// shuffle in the selection dag.
17794+
static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT,
17795+
SDValue V1, SDValue V2,
17796+
ArrayRef<int> Mask,
17797+
SelectionDAG &DAG) {
17798+
if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 &&
17799+
VT != MVT::v32i8)
17800+
return SDValue();
17801+
// <B0, B1, B0+1, B1+1, ..., >
17802+
auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0,
17803+
unsigned Begin1) {
17804+
size_t Size = Mask.size();
17805+
assert(Size % 2 == 0 && "Expected even mask size");
17806+
for (unsigned I = 0; I < Size; I += 2) {
17807+
if (Mask[I] != (int)(Begin0 + I / 2) ||
17808+
Mask[I + 1] != (int)(Begin1 + I / 2))
17809+
return false;
17810+
}
17811+
return true;
17812+
};
17813+
// Check which half is this shuffle node
17814+
int NumElts = VT.getVectorNumElements();
17815+
size_t FirstQtr = NumElts / 2;
17816+
size_t ThirdQtr = NumElts + NumElts / 2;
17817+
bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts);
17818+
bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr);
17819+
if (!IsFirstHalf && !IsSecondHalf)
17820+
return SDValue();
17821+
17822+
// Find the intersection between shuffle users of V1 and V2.
17823+
SmallVector<SDNode *, 2> Shuffles;
17824+
for (SDNode *User : V1->uses())
17825+
if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 &&
17826+
User->getOperand(1) == V2)
17827+
Shuffles.push_back(User);
17828+
// Limit user size to two for now.
17829+
if (Shuffles.size() != 2)
17830+
return SDValue();
17831+
// Find out which half of the 512-bit shuffles is each smaller shuffle
17832+
auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]);
17833+
auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]);
17834+
SDNode *FirstHalf;
17835+
SDNode *SecondHalf;
17836+
if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) &&
17837+
IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) {
17838+
FirstHalf = Shuffles[0];
17839+
SecondHalf = Shuffles[1];
17840+
} else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) &&
17841+
IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) {
17842+
FirstHalf = Shuffles[1];
17843+
SecondHalf = Shuffles[0];
17844+
} else {
17845+
return SDValue();
17846+
}
17847+
// Lower into unpck and perm. Return the perm of this shuffle and replace
17848+
// the other.
17849+
SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
17850+
SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
17851+
SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
17852+
DAG.getTargetConstant(0x20, DL, MVT::i8));
17853+
SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh,
17854+
DAG.getTargetConstant(0x31, DL, MVT::i8));
17855+
if (IsFirstHalf) {
17856+
DAG.ReplaceAllUsesWith(SecondHalf, &Perm2);
17857+
return Perm1;
17858+
}
17859+
DAG.ReplaceAllUsesWith(FirstHalf, &Perm1);
17860+
return Perm2;
17861+
}
1777817862

1777917863
/// Handle lowering of 4-lane 64-bit floating point shuffles.
1778017864
///
@@ -18082,6 +18166,16 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
1808218166
DAG, Subtarget))
1808318167
return V;
1808418168

18169+
// Try to match an interleave of two v8f32s and lower them as unpck and
18170+
// permutes using ymms. This needs to go before we try to split the vectors.
18171+
//
18172+
// TODO: Expand this to AVX1. Currently v8i32 is casted to v8f32 and hits
18173+
// this path inadvertently.
18174+
if (Subtarget.hasAVX2() && !Subtarget.hasAVX512())
18175+
if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2,
18176+
Mask, DAG))
18177+
return V;
18178+
1808518179
// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
1808618180
// since after split we get a more efficient code using vpunpcklwd and
1808718181
// vpunpckhwd instrs than vblend.
@@ -18120,6 +18214,13 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
1812018214
Zeroable, Subtarget, DAG))
1812118215
return ZExt;
1812218216

18217+
// Try to match an interleave of two v8i32s and lower them as unpck and
18218+
// permutes using ymms. This needs to go before we try to split the vectors.
18219+
if (!Subtarget.hasAVX512())
18220+
if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2,
18221+
Mask, DAG))
18222+
return V;
18223+
1812318224
// For non-AVX512 if the Mask is of 16bit elements in lane then try to split
1812418225
// since after split we get a more efficient code than vblend by using
1812518226
// vpunpcklwd and vpunpckhwd instrs.
@@ -18325,6 +18426,13 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
1832518426
DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget))
1832618427
return V;
1832718428

18429+
// Try to match an interleave of two v16i16s and lower them as unpck and
18430+
// permutes using ymms.
18431+
if (!Subtarget.hasAVX512())
18432+
if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2,
18433+
Mask, DAG))
18434+
return V;
18435+
1832818436
// Otherwise fall back on generic lowering.
1832918437
return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask,
1833018438
Subtarget, DAG);
@@ -18438,6 +18546,13 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
1843818546
Mask, Zeroable, DAG))
1843918547
return V;
1844018548

18549+
// Try to match an interleave of two v32i8s and lower them as unpck and
18550+
// permutes using ymms.
18551+
if (!Subtarget.hasAVX512())
18552+
if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2,
18553+
Mask, DAG))
18554+
return V;
18555+
1844118556
// Otherwise fall back on generic lowering.
1844218557
return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask,
1844318558
Subtarget, DAG);

llvm/test/CodeGen/X86/slow-pmulld.ll

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -492,15 +492,11 @@ define <16 x i32> @test_mul_v16i32_v16i16(<16 x i16> %A) {
492492
; AVX2-SLOW: # %bb.0:
493493
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778,18778]
494494
; AVX2-SLOW-NEXT: vpmulhuw %ymm1, %ymm0, %ymm2
495-
; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm1
496-
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
497-
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
498-
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm3, %ymm0
499-
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm2
500-
; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1
501-
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
502-
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
503-
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
495+
; AVX2-SLOW-NEXT: vpmullw %ymm1, %ymm0, %ymm0
496+
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
497+
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
498+
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm2[0,1],ymm1[0,1]
499+
; AVX2-SLOW-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm2[2,3],ymm1[2,3]
504500
; AVX2-SLOW-NEXT: ret{{[l|q]}}
505501
;
506502
; AVX2-32-LABEL: test_mul_v16i32_v16i16:

0 commit comments

Comments
 (0)