Commit 6ba5fc2

[X86] lowerShuffleWithVPMOV - support direct lowering to VPMOV on VLX targets
lowerShuffleWithVPMOV currently only matches shuffle(truncate(x)) patterns, but on VLX targets the truncate isn't usually necessary to make the VPMOV node worthwhile (as we're only targeting v16i8/v8i16 shuffles, we almost always end up with a PSHUFB node instead). PACKSS/PACKUS are still preferred over VPMOV due to their lower uop count. Fixes the remaining regression from the fixes in rG293899c64b75.
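
For example, the trunc_qw_128 case in avx512-trunc.ll (updated below) hits the affected pattern: by the time shuffle lowering runs, the truncate has typically been legalized into a plain widening shuffle with no ISD::TRUNCATE left to peek through, so SKX previously picked a vpshufb and can now emit vpmovqw directly. Reproduced from the test (function attributes omitted):

define <2 x i16> @trunc_qw_128(<2 x i64> %i) {
  %x = trunc <2 x i64> %i to <2 x i16>  ; ends up as a small widening shuffle after legalization
  ret <2 x i16> %x
}
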
1 parent dd4c838 commit 6ba5fc2

File tree

6 files changed: +45 −29 lines

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 17 additions & 6 deletions
@@ -12406,22 +12406,33 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1,
   unsigned EltSizeInBits = VT.getScalarSizeInBits();
   unsigned MaxScale = 64 / EltSizeInBits;
   for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+    unsigned SrcEltBits = EltSizeInBits * Scale;
     unsigned NumSrcElts = NumElts / Scale;
     unsigned UpperElts = NumElts - NumSrcElts;
     if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) ||
         !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
       continue;
 
+    // Attempt to find a matching source truncation, but as a fall back VLX
+    // cases can use the VPMOV directly.
     SDValue Src = peekThroughBitcasts(V1);
-    if (Src.getOpcode() != ISD::TRUNCATE ||
-        Src.getScalarValueSizeInBits() != (EltSizeInBits * Scale))
+    if (Src.getOpcode() == ISD::TRUNCATE &&
+        Src.getScalarValueSizeInBits() == SrcEltBits) {
+      Src = Src.getOperand(0);
+    } else if (Subtarget.hasVLX()) {
+      MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+      MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+      Src = DAG.getBitcast(SrcVT, Src);
+      // Don't do this if PACKSS/PACKUS could perform it cheaper.
+      if (Scale == 2 &&
+          ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) ||
+           (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits)))
+        return SDValue();
+    } else
       return SDValue();
-    Src = Src.getOperand(0);
 
     // VPMOVWB is only available with avx512bw.
-    MVT SrcVT = Src.getSimpleValueType();
-    if (SrcVT.getVectorElementType() == MVT::i16 && VT == MVT::v16i8 &&
-        !Subtarget.hasBWI())
+    if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32)
       return SDValue();
 
     bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts);
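
The Scale == 2 guard above exists so the cheaper PACKSS/PACKUS lowerings still win: if the source already has more sign bits (or leading zero bits) than the destination element width, a single pack performs the truncation in fewer uops, so the new VLX fallback steps aside. A hypothetical sketch (not one of this commit's tests, name invented for illustration) of a case the guard is meant to leave on the pack path:

define <4 x i16> @trunc_signbits(<4 x i32> %a) {
  ; ashr by 17 leaves 18 sign bits per i32 element, so ComputeNumSignBits
  ; exceeds the 16-bit destination width and vpackssdw is expected to be
  ; used here instead of vpmovdw.
  %sh = ashr <4 x i32> %a, <i32 17, i32 17, i32 17, i32 17>
  %t = trunc <4 x i32> %sh to <4 x i16>
  ret <4 x i16> %t
}
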

llvm/test/CodeGen/X86/avx512-trunc.ll

Lines changed: 1 addition & 1 deletion
@@ -187,7 +187,7 @@ define <2 x i16> @trunc_qw_128(<2 x i64> %i) #0 {
 ;
 ; SKX-LABEL: trunc_qw_128:
 ; SKX: ## %bb.0:
-; SKX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
+; SKX-NEXT: vpmovqw %xmm0, %xmm0
 ; SKX-NEXT: retq
   %x = trunc <2 x i64> %i to <2 x i16>
   ret <2 x i16> %x

llvm/test/CodeGen/X86/avx512fp16-cvt-ph-w-vl-intrinsics.ll

Lines changed: 1 addition & 2 deletions
@@ -739,8 +739,7 @@ define <2 x half> @test_s8tofp2(<2 x i8> %arg0) {
 define <2 x half> @test_u1tofp2(<2 x i1> %arg0) {
 ; CHECK-LABEL: test_u1tofp2:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; CHECK-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; CHECK-NEXT: vpmovqw %xmm0, %xmm0
 ; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT: vcvtuw2ph %xmm0, %xmm0
 ; CHECK-NEXT: retq

llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll

Lines changed: 3 additions & 4 deletions
@@ -73,10 +73,9 @@ define void @vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nounwind {
 ; AVX512-LABEL: vf4:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpmovdw %xmm0, %xmm1
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vmovq %xmm1, (%rsi)
-; AVX512-NEXT: vmovq %xmm0, (%rdx)
+; AVX512-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpmovdw %xmm0, (%rsi)
+; AVX512-NEXT: vmovq %xmm1, (%rdx)
 ; AVX512-NEXT: retq
   %wide.vec = load <8 x i16>, ptr %in.vec, align 32

llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll

Lines changed: 8 additions & 9 deletions
@@ -42,16 +42,15 @@ define void @vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %
 ; AVX512-LABEL: vf2:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vpmovqw %xmm0, %xmm1
-; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,2,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
-; AVX512-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,0,2,3,4,5,6,7]
-; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
-; AVX512-NEXT: vmovd %xmm1, (%rsi)
-; AVX512-NEXT: vmovd %xmm2, (%rdx)
+; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[2,0,2,3,4,5,6,7]
+; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
+; AVX512-NEXT: vpmovqw %xmm0, (%rsi)
+; AVX512-NEXT: vmovd %xmm1, (%rdx)
 ; AVX512-NEXT: vmovd %xmm3, (%rcx)
-; AVX512-NEXT: vmovd %xmm0, (%r8)
+; AVX512-NEXT: vmovd %xmm2, (%r8)
 ; AVX512-NEXT: retq
   %wide.vec = load <8 x i16>, ptr %in.vec, align 32

llvm/test/CodeGen/X86/vector-rotate-128.ll

Lines changed: 15 additions & 7 deletions
@@ -1935,13 +1935,21 @@ define <4 x i16> @rot16_trunc(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX-NEXT: retq
 ;
-; AVX512-LABEL: rot16_trunc:
-; AVX512: # %bb.0:
-; AVX512-NEXT: vpsrld $11, %xmm0, %xmm1
-; AVX512-NEXT: vpslld $5, %xmm0, %xmm0
-; AVX512-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; AVX512-NEXT: retq
+; AVX512NOVLX-LABEL: rot16_trunc:
+; AVX512NOVLX: # %bb.0:
+; AVX512NOVLX-NEXT: vpsrld $11, %xmm0, %xmm1
+; AVX512NOVLX-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX512NOVLX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512NOVLX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512NOVLX-NEXT: retq
+;
+; AVX512VLX-LABEL: rot16_trunc:
+; AVX512VLX: # %bb.0:
+; AVX512VLX-NEXT: vpsrld $11, %xmm0, %xmm1
+; AVX512VLX-NEXT: vpslld $5, %xmm0, %xmm0
+; AVX512VLX-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512VLX-NEXT: vpmovdw %xmm0, %xmm0
+; AVX512VLX-NEXT: retq
 ;
 ; XOP-LABEL: rot16_trunc:
 ; XOP: # %bb.0:
