[X86] ReplaceNodeResults - truncate sub-128-bit vectors as shuffles directly (#83120)

RKSimon · web-flow · commit 13c359aa9b5e · 2024-02-27T15:03:42.000Z
We were scalarizing these truncations, but in most cases we can widen the source vector to 128-bits and perform the truncation as a shuffle directly (which will usually lower as a PACK or PSHUFB). For the cases where the widening and shuffle isn't legal we can leave it to generic legalization to scalarize for us. Fixes #81883
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32341,20 +32341,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
       }
     }
 
-    if (128 % InBits == 0) {
+    if ((128 % InBits) == 0 && WidenVT.is128BitVector()) {
       // 128 bit and smaller inputs should avoid truncate all together and
-      // just use a build_vector that will become a shuffle.
-      // TODO: Widen and use a shuffle directly?
-      SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
-      // Use the original element count so we don't do more scalar opts than
-      // necessary.
-      for (unsigned i=0; i < MinElts; ++i) {
-        SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In,
-                                  DAG.getIntPtrConstant(i, dl));
-        Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val);
-      }
-      Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops));
-      return;
+      // use a shuffle.
+      if ((InEltVT.getSizeInBits() % EltVT.getSizeInBits()) == 0) {
+        int Scale = InEltVT.getSizeInBits() / EltVT.getSizeInBits();
+        SmallVector<int, 16> TruncMask(WidenNumElts, -1);
+        for (unsigned I = 0; I < MinElts; ++I)
+          TruncMask[I] = Scale * I;
+        SDValue WidenIn = widenSubVector(In, false, Subtarget, DAG, dl, 128);
+        assert(isTypeLegal(WidenVT) && isTypeLegal(WidenIn.getValueType()) &&
+               "Illegal vector type in truncation");
+        WidenIn = DAG.getBitcast(WidenVT, WidenIn);
+        Results.push_back(
+            DAG.getVectorShuffle(WidenVT, dl, WidenIn, WidenIn, TruncMask));
+        return;
+      }
     }
 
     // With AVX512 there are some cases that can use a target specific
diff --git a/llvm/test/CodeGen/X86/extract-concat.ll b/llvm/test/CodeGen/X86/extract-concat.ll
@@ -9,22 +9,17 @@ define void @foo(<4 x float> %in, ptr %out) {
 ; SSE2-LABEL: foo:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE2-NEXT:    shll $8, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    movl -{{[0-9]+}}(%rsp), %eax
-; SSE2-NEXT:    shll $16, %eax
-; SSE2-NEXT:    orl %ecx, %eax
-; SSE2-NEXT:    orl $-16777216, %eax # imm = 0xFF000000
-; SSE2-NEXT:    movl %eax, (%rdi)
+; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE2-NEXT:    movd %xmm0, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: foo:
 ; SSE42:       # %bb.0:
 ; SSE42-NEXT:    cvttps2dq %xmm0, %xmm0
-; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE42-NEXT:    movl $255, %eax
 ; SSE42-NEXT:    pinsrb $3, %eax, %xmm0
 ; SSE42-NEXT:    movd %xmm0, (%rdi)
@@ -33,7 +28,7 @@ define void @foo(<4 x float> %in, ptr %out) {
 ; AVX-LABEL: foo:
 ; AVX:       # %bb.0:
 ; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    movl $255, %eax
 ; AVX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
 ; AVX-NEXT:    vmovd %xmm0, (%rdi)
diff --git a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll
@@ -112,27 +112,10 @@ define <4 x i8> @func_8_16(ptr %a, ptr %b) nounwind {
 ;
 ; X64-LABEL: func_8_16:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq (%rdi), %rax
-; X64-NEXT:    vmovd %eax, %xmm0
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    shrl $16, %ecx
-; X64-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    shrq $32, %rcx
-; X64-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
-; X64-NEXT:    shrq $48, %rax
-; X64-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; X64-NEXT:    movq (%rsi), %rax
-; X64-NEXT:    vmovd %eax, %xmm1
-; X64-NEXT:    movl %eax, %ecx
-; X64-NEXT:    shrl $16, %ecx
-; X64-NEXT:    vpinsrb $1, %ecx, %xmm1, %xmm1
-; X64-NEXT:    movq %rax, %rcx
-; X64-NEXT:    shrq $32, %rcx
-; X64-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
-; X64-NEXT:    shrq $48, %rax
-; X64-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; X64-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; X64-NEXT:    vpaddb %xmm0, %xmm1, %xmm0
+; X64-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,u,u,u,u,u,u,u,u,u,u,u,u]
 ; X64-NEXT:    retq
   %F = load <4 x i16>, ptr %a
   %G = trunc <4 x i16> %F to <4 x i8>
diff --git a/llvm/test/CodeGen/X86/vec_cast.ll b/llvm/test/CodeGen/X86/vec_cast.ll
@@ -156,7 +156,7 @@ define <3 x i16> @h(<3 x i32> %a) nounwind {
 ; CHECK-WIN-LABEL: h:
 ; CHECK-WIN:       # %bb.0:
 ; CHECK-WIN-NEXT:    movdqa (%rcx), %xmm0
-; CHECK-WIN-NEXT:    movl (%rcx), %eax
+; CHECK-WIN-NEXT:    movd %xmm0, %eax
 ; CHECK-WIN-NEXT:    pextrw $2, %xmm0, %edx
 ; CHECK-WIN-NEXT:    pextrw $4, %xmm0, %ecx
 ; CHECK-WIN-NEXT:    # kill: def $ax killed $ax killed $eax