
Commit 76bc3d6

[X86] In lowerVectorShuffle, instead of creating a new node to canonicalize the shuffle mask by commuting, just commute the mask and swap V1/V2.
LegalizeDAG tries to legalize the DAG by legalizing nodes before their operands. If we create a new node, we end up legalizing it after its operands. This prevents some of the optimizations that can be done when the operand is a build_vector, since the build_vector will have been legalized to something else.

Differential Revision: https://reviews.llvm.org/D65132

llvm-svn: 366835
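As a minimal standalone sketch of the technique (illustrative C++ written for this summary, not code from the tree; the Value struct and canonicalize helper are hypothetical stand-ins), mirroring the semantics of ShuffleVectorSDNode::commuteMask: indices below NumElts select from V1, indices at or above NumElts select from V2, and -1 is undef.

#include <utility>
#include <vector>

struct Value {}; // stand-in for SDValue

// Mirrors ShuffleVectorSDNode::commuteMask: remap each index across
// the V1/V2 boundary; undef (-1) lanes stay undef.
void commuteMask(std::vector<int> &Mask) {
  const int NumElts = static_cast<int>(Mask.size());
  for (int &M : Mask) {
    if (M < 0)
      continue;
    M = M < NumElts ? M + NumElts : M - NumElts;
  }
}

void canonicalize(Value &V1, Value &V2, std::vector<int> &Mask,
                  bool ShouldCommute) {
  if (ShouldCommute) {
    commuteMask(Mask); // rewrite the mask in place...
    std::swap(V1, V2); // ...and swap operands so semantics are unchanged
  }
  // Lowering continues with (V1, V2, Mask); no new shuffle node is
  // created, so nothing is re-queued behind already-legalized operands.
}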
Parent: 374398a

6 files changed, 38 insertions(+), 38 deletions(-)


llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 15 additions & 11 deletions
@@ -16650,7 +16650,7 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
-  ArrayRef<int> Mask = SVOp->getMask();
+  ArrayRef<int> OrigMask = SVOp->getMask();
   SDValue V1 = Op.getOperand(0);
   SDValue V2 = Op.getOperand(1);
   MVT VT = Op.getSimpleValueType();
@@ -16676,36 +16676,37 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
   // undef as well. This makes it easier to match the shuffle based solely on
   // the mask.
   if (V2IsUndef &&
-      any_of(Mask, [NumElements](int M) { return M >= NumElements; })) {
-    SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
+      any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) {
+    SmallVector<int, 8> NewMask(OrigMask.begin(), OrigMask.end());
     for (int &M : NewMask)
       if (M >= NumElements)
         M = -1;
     return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
   }

   // Check for illegal shuffle mask element index values.
-  int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
-  assert(llvm::all_of(Mask,
+  int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2);
+  (void)MaskUpperLimit;
+  assert(llvm::all_of(OrigMask,
                       [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
          "Out of bounds shuffle index");

   // We actually see shuffles that are entirely re-arrangements of a set of
   // zero inputs. This mostly happens while decomposing complex shuffles into
   // simple ones. Directly lower these as a buildvector of zeros.
-  APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  APInt Zeroable = computeZeroableShuffleElements(OrigMask, V1, V2);
   if (Zeroable.isAllOnesValue())
     return getZeroVector(VT, Subtarget, DAG, DL);

   bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());

   // Create an alternative mask with info about zeroable elements.
   // Here we do not set undef elements as zeroable.
-  SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+  SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end());
   if (V2IsZero) {
     assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
     for (int i = 0; i != NumElements; ++i)
-      if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+      if (OrigMask[i] != SM_SentinelUndef && Zeroable[i])
         ZeroableMask[i] = SM_SentinelZero;
   }

@@ -16720,7 +16721,7 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
   // by obfuscating the operands with bitcasts.
   // TODO: Avoid lowering directly from this top-level function: make this
   // a query (canLowerAsBroadcast) and defer lowering to the type-based calls.
-  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask,
+  if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask,
                                                   Subtarget, DAG))
     return Broadcast;

@@ -16756,8 +16757,11 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
   }

   // Commute the shuffle if it will improve canonicalization.
-  if (canonicalizeShuffleMaskWithCommute(Mask))
-    return DAG.getCommutedVectorShuffle(*SVOp);
+  SmallVector<int, 64> Mask(OrigMask.begin(), OrigMask.end());
+  if (canonicalizeShuffleMaskWithCommute(Mask)) {
+    ShuffleVectorSDNode::commuteMask(Mask);
+    std::swap(V1, V2);
+  }

   if (SDValue V = lowerShuffleWithVPMOV(DL, Mask, VT, V1, V2, DAG, Subtarget))
     return V;
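To see why the in-place commute matters, here is a toy model of the ordering problem the commit message describes (an illustration only, not the real LegalizeDAG; the node kinds and the legalize walk are invented for the example): a node is visited before its operands, so a node created during legalization is only reached after its operands have already been rewritten, by which point a build_vector operand may have become something else.

#include <cstdio>
#include <string>
#include <vector>

// Toy DAG node: an opcode name plus operands.
struct Node {
  std::string Kind;
  std::vector<Node *> Ops;
};

// Visit a node, then its operands, the order described in the commit
// message. A shuffle can only use build_vector-specific tricks if its
// operand is still a build_vector when the shuffle is visited.
void legalize(Node *N) {
  if (N->Kind == "shuffle") {
    bool SawBuildVector = false;
    for (Node *Op : N->Ops)
      SawBuildVector |= (Op->Kind == "build_vector");
    std::printf("shuffle visited, build_vector operand visible: %s\n",
                SawBuildVector ? "yes" : "no");
  }
  if (N->Kind == "build_vector")
    N->Kind = "constant_pool_load"; // toy legalization of the operand
  for (Node *Op : N->Ops)
    legalize(Op);
}

int main() {
  // The original shuffle is visited before its operand: prints "yes".
  Node BV1{"build_vector", {}};
  Node Shuf1{"shuffle", {&BV1}};
  legalize(&Shuf1);

  // A shuffle created mid-legalization is only reached after its
  // operand has been processed: prints "no".
  Node BV2{"build_vector", {}};
  Node Shuf2{"shuffle", {&BV2}};
  legalize(&BV2);   // operand handled first
  legalize(&Shuf2); // replacement node handled afterwards
}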

llvm/test/CodeGen/X86/oddshuffles.ll

Lines changed: 6 additions & 7 deletions
@@ -1056,7 +1056,7 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3,4],xmm4[5,6,7]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,7,8,9,2,3,8,9,14,15]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,14,15,0,1,2,3]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm2[5,6,7]
 ; AVX1-NEXT:    vmovdqu %xmm3, (%rsi)
 ; AVX1-NEXT:    vmovdqu %xmm4, (%rdx)
@@ -1094,8 +1094,8 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
 ; XOP-NEXT:    vpperm {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15],xmm2[4,5,10,11]
 ; XOP-NEXT:    vpblendw {{.*#+}} xmm4 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7]
 ; XOP-NEXT:    vpperm {{.*#+}} xmm4 = xmm4[2,3,8,9,14,15,4,5,10,11],xmm2[0,1,6,7,12,13]
-; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[4,5,10,11],xmm1[0,1,6,7,12,13,14,15,0,1,2,3]
-; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,7,8,9],xmm2[2,3,8,9,14,15]
+; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3,4],xmm0[5],xmm1[6,7]
+; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[4,5,10,11,0,1,6,7,12,13],xmm2[2,3,8,9,14,15]
 ; XOP-NEXT:    vmovdqu %xmm3, (%rsi)
 ; XOP-NEXT:    vmovdqu %xmm4, (%rdx)
 ; XOP-NEXT:    vmovdqu %xmm0, (%rcx)
@@ -1187,7 +1187,7 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm3[1,1,2,2]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0],xmm2[1,2],xmm4[3],xmm2[4,5],xmm4[6],xmm2[7]
 ; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm4 = xmm4[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm3[2,2,3,3]
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5,6],xmm5[7]
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
@@ -1233,9 +1233,8 @@ define void @interleave_24i16_in(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2,
 ; XOP-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; XOP-NEXT:    vpperm {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm2[0,1],xmm4[4,5,6,7],xmm2[2,3],xmm4[8,9,10,11]
 ; XOP-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
-; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm1[10,11],xmm0[12,13,12,13],xmm1[12,13,12,13],xmm0[14,15],xmm1[14,15],xmm0[14,15]
-; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm2[2,2,3,3]
-; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7]
+; XOP-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[4,5],xmm2[10,11],xmm0[10,11,8,9],xmm2[12,13],xmm0[14,15,12,13],xmm2[14,15]
 ; XOP-NEXT:    vmovdqu %xmm0, 32(%rdi)
 ; XOP-NEXT:    vmovups %ymm3, (%rdi)
 ; XOP-NEXT:    vzeroupper

llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll

Lines changed: 2 additions & 2 deletions
@@ -1323,8 +1323,8 @@ define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
 ; XOPAVX1:       # %bb.0:
 ; XOPAVX1-NEXT:    vpsllq $48, %xmm0, %xmm0
 ; XOPAVX1-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
+; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
 ; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
 ; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
@@ -1334,8 +1334,8 @@ define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
 ; XOPAVX2:       # %bb.0:
 ; XOPAVX2-NEXT:    vpsllq $48, %xmm0, %xmm0
 ; XOPAVX2-NEXT:    vpshaq {{.*}}(%rip), %xmm0, %xmm0
-; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
+; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
 ; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
 ; XOPAVX2-NEXT:    vpshaq %xmm1, %xmm0, %xmm0

llvm/test/CodeGen/X86/vector-shuffle-128-v2.ll

Lines changed: 9 additions & 6 deletions
@@ -718,20 +718,23 @@ define <2 x i64> @shuffle_v2i64_z0(<2 x i64> %a) {
 define <2 x i64> @shuffle_v2i64_z1(<2 x i64> %a) {
 ; SSE2-LABEL: shuffle_v2i64_z1:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    xorpd %xmm1, %xmm1
-; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE2-NEXT:    xorps %xmm1, %xmm1
+; SSE2-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE2-NEXT:    movaps %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSE3-LABEL: shuffle_v2i64_z1:
 ; SSE3:       # %bb.0:
-; SSE3-NEXT:    xorpd %xmm1, %xmm1
-; SSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSE3-NEXT:    xorps %xmm1, %xmm1
+; SSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSE3-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v2i64_z1:
 ; SSSE3:       # %bb.0:
-; SSSE3-NEXT:    xorpd %xmm1, %xmm1
-; SSSE3-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SSSE3-NEXT:    xorps %xmm1, %xmm1
+; SSSE3-NEXT:    unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
+; SSSE3-NEXT:    movaps %xmm1, %xmm0
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: shuffle_v2i64_z1:

llvm/test/CodeGen/X86/vector-zext-widen.ll

Lines changed: 3 additions & 6 deletions
@@ -1999,19 +1999,16 @@ define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtabl
 ;
 ; SSE41-LABEL: shuf_zext_16i16_to_8i32_offset8:
 ; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: shuf_zext_16i16_to_8i32_offset8:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq

llvm/test/CodeGen/X86/vector-zext.ll

Lines changed: 3 additions & 6 deletions
@@ -2057,19 +2057,16 @@ define <8 x i32> @shuf_zext_16i16_to_8i32_offset8(<16 x i16> %A) nounwind uwtabl
 ;
 ; SSE41-LABEL: shuf_zext_16i16_to_8i32_offset8:
 ; SSE41:       # %bb.0: # %entry
-; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,2,3,3]
 ; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3],xmm0[4],xmm2[5,6,7]
 ; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT:    movdqa %xmm2, %xmm1
+; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: shuf_zext_16i16_to_8i32_offset8:
 ; AVX1:       # %bb.0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3]
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
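The improved sequences in the two zext tests rely on the identity that unpacking 16-bit lanes against a zeroed register is itself a zero extension: interleaving each word with a zero word produces, in little-endian lane order, exactly the 32-bit zero-extended values. Below is a small host-side check of that identity for the high four words of a 128-bit register, which is what punpckhwd with a zeroed xmm computes (plain C++ written for this summary, unrelated to the test harness).

#include <cstdint>
#include <cstdio>

int main() {
  // One 128-bit register holding 8 x i16.
  uint16_t W[8] = {0x1111, 0x2222, 0x3333, 0x4444,
                   0xAAAA, 0xBBBB, 0xCCCC, 0xDDDD};

  // punpckhwd W, zero: interleave the high 4 words of each source,
  // giving [W[4], 0, W[5], 0, W[6], 0, W[7], 0].
  uint16_t R[8];
  for (int i = 0; i < 4; ++i) {
    R[2 * i] = W[4 + i]; // low half of the resulting i32 lane
    R[2 * i + 1] = 0;    // high half is zero
  }

  // Reassemble each pair of words as a little-endian i32 lane and
  // compare against a plain zero extension of W[4..7].
  for (int i = 0; i < 4; ++i) {
    uint32_t Lane = (uint32_t)R[2 * i] | ((uint32_t)R[2 * i + 1] << 16);
    std::printf("lane %d: %#x, zext: %#x\n", i, Lane, (uint32_t)W[4 + i]);
  }
}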
