Commit 6ba5fc2

[X86] lowerShuffleWithVPMOV - support direct lowering to VPMOV on VLX targets
lowerShuffleWithVPMOV currently only matches shuffle(truncate(x)) patterns, but on VLX targets the truncate isn't usually necessary to make the VPMOV node worthwhile (as we're only targeting v16i8/v8i16 shuffles, we almost always end up with a PSHUFB node instead). PACKSS/PACKUS are still preferred over VPMOV due to their lower uop count. Fixes the remaining regression from the fixes in rG293899c64b75.
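
For example, the trunc_qw_128 case in avx512-trunc.ll (updated below) hits the affected pattern: by the time shuffle lowering runs, the truncate has typically been legalized into a plain widening shuffle with no ISD::TRUNCATE left to peek through, so SKX previously picked a vpshufb and can now emit vpmovqw directly. Reproduced from the test (function attributes omitted):

define <2 x i16> @trunc_qw_128(<2 x i64> %i) {
  %x = trunc <2 x i64> %i to <2 x i16>  ; ends up as a small widening shuffle after legalization
  ret <2 x i16> %x
}
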
1 parent dd4c838 commit 6ba5fc2

File tree

6 files changed: +45 −29 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 17 additions & 6 deletions
@@ -12406,22 +12406,33 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
   unsigned EltSizeInBits = VT.getScalarSizeInBits();
   unsigned MaxScale = 64 / EltSizeInBits;
   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+    unsigned SrcEltBits = EltSizeInBits * Scale;
     unsigned NumSrcElts = NumElts / Scale;
     unsigned UpperElts = NumElts - NumSrcElts;
     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
       continue;
 
+    // Attempt to find a matching source truncation, but as a fall back VLX
+    // cases can use the VPMOV directly.
     SDValue Src = peekThroughBitcasts(V1);
-    if (Src.getOpcode() != ISD::TRUNCATE ||
-        Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
+    if (Src.getOpcode() == ISD::TRUNCATE &&
+        Src.getScalarValueSizeInBits() == SrcEltBits) {
+      Src = Src.getOperand(0);
+    } else if (Subtarget.hasVLX()) {
+      MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+      MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+      Src = DAG.getBitcast(SrcVT, Src);
+      // Don't do this if PACKSS/PACKUS could perform it cheaper.
+      if (Scale == 2 &&
+          ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
+           (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
+        return SDValue();
+    } else
       return SDValue();
-    Src = Src.getOperand(0);
 
     // VPMOVWB is only available with avx512bw.
-    MVT SrcVT = Src.getSimpleValueType();
-    if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
-        !Subtarget.hasBWI())
+    if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
       return SDValue();
 
     bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
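
The Scale == 2 guard above exists so the cheaper PACKSS/PACKUS lowerings still win: if the source already has more sign bits (or leading zero bits) than the destination element width, a single pack performs the truncation in fewer uops, so the new VLX fallback steps aside. A hypothetical sketch (not one of this commit's tests, name invented for illustration) of a case the guard is meant to leave on the pack path:

define <4 x i16> @trunc_signbits(<4 x i32> %a) {
  ; ashr by 17 leaves 18 sign bits per i32 element, so ComputeNumSignBits
  ; exceeds the 16-bit destination width and vpackssdw is expected to be
  ; used here instead of vpmovdw.
  %sh = ashr <4 x i32> %a, <i32 17, i32 17, i32 17, i32 17>
  %t = trunc <4 x i32> %sh to <4 x i16>
  ret <4 x i16> %t
}
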

llvm/test/CodeGen/X86/avx512-trunc.ll

Lines changed: 1 addition & 1 deletion
@@ -187,7 +187,7 @@ define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 {
 ;
 ; SKX-LABEL: trunc_qw_128:
 ; SKX: ## %bb.0:
-; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; SKX-NEXT: vpmovqw %xmm0, %xmm0
 ; SKX-NEXT: retq
   %x = trunc <2 x i64> %i to <2 x i16>
   ret <2 x i16> %x

llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll

Lines changed: 1 addition & 2 deletions
@@ -739,8 +739,7 @@ define <2 x half> @test_s8tofp2(<2 x i8> %arg0) {
 define <2 x half> @test_u1tofp2(<2 x i1> %arg0) {
 ; CHECK-LABEL: test_u1tofp2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-NEXT: vpmovqw %xmm0, %xmm0
 ; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0
 ; CHECK-NEXT: retq

llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll

Lines changed: 3 additions & 4 deletions
@@ -73,10 +73,9 @@ define void @vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
 ; AVX512-LABEL: vf4:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpmovdw %xmm0, %xmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovq %xmm1, (%rsi)
-; AVX512-NEXT: vmovq %xmm0, (%rdx)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512-NEXT: vmovq %xmm1, (%rdx)
 ; AVX512-NEXT: retq
   %wide.vec = load <8 x i16>, ptr %in.vec, align 32

llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll

Lines changed: 8 additions & 9 deletions
@@ -42,16 +42,15 @@ define void @vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
 ; AVX512-LABEL: vf2:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpmovqw %xmm0, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vmovd %xmm1, (%rsi)
-; AVX512-NEXT: vmovd %xmm2, (%rdx)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512-NEXT: vmovd %xmm1, (%rdx)
 ; AVX512-NEXT: vmovd %xmm3, (%rcx)
-; AVX512-NEXT: vmovd %xmm0, (%r8)
+; AVX512-NEXT: vmovd %xmm2, (%r8)
 ; AVX512-NEXT: retq
   %wide.vec = load <8 x i16>, ptr %in.vec, align 32

llvm/test/CodeGen/X86/vector-rotate-128.ll

Lines changed: 15 additions & 7 deletions
@@ -1935,13 +1935,21 @@ define <4 x i16> @rot16_trunc(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX-NEXT: retq
 ;
-; AVX512-LABEL: rot16_trunc:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $11, %xmm0, %xmm1
-; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT: retq
+; AVX512NOVLX-LABEL: rot16_trunc:
+; AVX512NOVLX: # %bb.0:
+; AVX512NOVLX-NEXT: vpsrld $11, %xmm0, %xmm1
+; AVX512NOVLX-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX512NOVLX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512NOVLX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512NOVLX-NEXT: retq
+;
+; AVX512VLX-LABEL: rot16_trunc:
+; AVX512VLX: # %bb.0:
+; AVX512VLX-NEXT: vpsrld $11, %xmm0, %xmm1
+; AVX512VLX-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX512VLX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VLX-NEXT: vpmovdw %xmm0, %xmm0
+; AVX512VLX-NEXT: retq
 ;
 ; XOP-LABEL: rot16_trunc:
 ; XOP: # %bb.0:
