[X86] Lower blended PACKUSes using appropriate types.

ahmedbougacha · ahmedbougacha · commit 2ff396ff480e · 2016-06-30T07:42:47.000-07:00
When lowering two blended PACKUS, we used to disregard the types of the PACKUS inputs, indiscriminately generating a v16i8 PACKUS. This leads to non-selectable things like: (v16i8 (PACKUS (v4i32 v0), (v4i32 v1))) Instead, check that the PACKUSes have the same type, and use that as the final result type. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274138 91177308-0d34-0410-b5e6-96231b3b80d8 (cherry picked from commit cdfe078)
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
@@ -8771,24 +8771,27 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
   assert(Mask[0] < 2 && "We sort V1 to be the first input.");
   assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
 
-  // If we have a blend of two PACKUS operations an the blend aligns with the
-  // low and half halves, we can just merge the PACKUS operations. This is
-  // particularly important as it lets us merge shuffles that this routine itself
-  // creates.
+  // If we have a blend of two same-type PACKUS operations and the blend aligns
+  // with the low and high halves, we can just merge the PACKUS operations.
+  // This is particularly important as it lets us merge shuffles that this
+  // routine itself creates.
   auto GetPackNode = [](SDValue V) {
     while (V.getOpcode() == ISD::BITCAST)
       V = V.getOperand(0);
 
     return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
   };
   if (SDValue V1Pack = GetPackNode(V1))
-    if (SDValue V2Pack = GetPackNode(V2))
-      return DAG.getBitcast(MVT::v2i64,
-                            DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
-                                        Mask[0] == 0 ? V1Pack.getOperand(0)
-                                                     : V1Pack.getOperand(1),
-                                        Mask[1] == 2 ? V2Pack.getOperand(0)
-                                                     : V2Pack.getOperand(1)));
+    if (SDValue V2Pack = GetPackNode(V2)) {
+      EVT PackVT = V1Pack.getValueType();
+      if (PackVT == V2Pack.getValueType())
+        return DAG.getBitcast(MVT::v2i64,
+                              DAG.getNode(X86ISD::PACKUS, DL, PackVT,
+                                          Mask[0] == 0 ? V1Pack.getOperand(0)
+                                                       : V1Pack.getOperand(1),
+                                          Mask[1] == 2 ? V2Pack.getOperand(0)
+                                                       : V2Pack.getOperand(1)));
+    }
 
   // Try to use shift instructions.
   if (SDValue Shift =
diff --git a/test/CodeGen/X86/vector-shuffle-sse41.ll b/test/CodeGen/X86/vector-shuffle-sse41.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX
+
+define <8 x i16> @blend_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
+; SSE41-LABEL: blend_packusdw:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    packusdw %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: blend_packusdw:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+  %p1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a2, <4 x i32> %a3)
+  %s0 = shufflevector <8 x i16> %p0, <8 x i16> %p1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  ret <8 x i16> %s0
+}
+
+define <16 x i8> @blend_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
+; SSE41-LABEL: blend_packuswb:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    packuswb %xmm2, %xmm0
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: blend_packuswb:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %p0 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
+  %p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
+  %s0 = shufflevector <16 x i8> %p0, <16 x i8> %p1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  ret <16 x i8> %s0
+}
+
+define <8 x i16> @blend_packusdw_packuswb(<4 x i32> %a0, <4 x i32> %a1, <8 x i16> %a2, <8 x i16> %a3) {
+; SSE41-LABEL: blend_packusdw_packuswb:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    packusdw %xmm1, %xmm0
+; SSE41-NEXT:    packuswb %xmm3, %xmm2
+; SSE41-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: blend_packusdw_packuswb:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vpackuswb %xmm3, %xmm2, %xmm1
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    retq
+  %p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+  %p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
+  %b1 = bitcast <16 x i8> %p1 to <8 x i16>
+  %s0 = shufflevector <8 x i16> %p0, <8 x i16> %b1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+  ret <8 x i16> %s0
+}
+
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)