Skip to content

Commit 2ff396f

Browse files
committed
[X86] Lower blended PACKUSes using appropriate types.
When lowering two blended PACKUSes, we used to disregard the types of the PACKUS inputs, indiscriminately generating a v16i8 PACKUS. This leads to non-selectable nodes such as: (v16i8 (PACKUS (v4i32 v0), (v4i32 v1))). Instead, check that the PACKUSes have the same type, and use that as the final result type. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274138 91177308-0d34-0410-b5e6-96231b3b80d8 (cherry picked from commit cdfe078)
1 parent 892e6cd commit 2ff396f

File tree

2 files changed

+73
-11
lines changed

2 files changed

+73
-11
lines changed

lib/Target/X86/X86ISelLowering.cpp

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8771,24 +8771,27 @@ static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
87718771
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
87728772
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
87738773

8774-
// If we have a blend of two PACKUS operations an the blend aligns with the
8775-
// low and half halves, we can just merge the PACKUS operations. This is
8776-
// particularly important as it lets us merge shuffles that this routine itself
8777-
// creates.
8774+
// If we have a blend of two same-type PACKUS operations and the blend aligns
8775+
// with the low and high halves, we can just merge the PACKUS operations.
8776+
// This is particularly important as it lets us merge shuffles that this
8777+
// routine itself creates.
87788778
auto GetPackNode = [](SDValue V) {
87798779
while (V.getOpcode() == ISD::BITCAST)
87808780
V = V.getOperand(0);
87818781

87828782
return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
87838783
};
87848784
if (SDValue V1Pack = GetPackNode(V1))
8785-
if (SDValue V2Pack = GetPackNode(V2))
8786-
return DAG.getBitcast(MVT::v2i64,
8787-
DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
8788-
Mask[0] == 0 ? V1Pack.getOperand(0)
8789-
: V1Pack.getOperand(1),
8790-
Mask[1] == 2 ? V2Pack.getOperand(0)
8791-
: V2Pack.getOperand(1)));
8785+
if (SDValue V2Pack = GetPackNode(V2)) {
8786+
EVT PackVT = V1Pack.getValueType();
8787+
if (PackVT == V2Pack.getValueType())
8788+
return DAG.getBitcast(MVT::v2i64,
8789+
DAG.getNode(X86ISD::PACKUS, DL, PackVT,
8790+
Mask[0] == 0 ? V1Pack.getOperand(0)
8791+
: V1Pack.getOperand(1),
8792+
Mask[1] == 2 ? V2Pack.getOperand(0)
8793+
: V2Pack.getOperand(1)));
8794+
}
87928795

87938796
// Try to use shift instructions.
87948797
if (SDValue Shift =
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
3+
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX
4+
5+
define <8 x i16> @blend_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
6+
; SSE41-LABEL: blend_packusdw:
7+
; SSE41: # BB#0:
8+
; SSE41-NEXT: packusdw %xmm2, %xmm0
9+
; SSE41-NEXT: retq
10+
;
11+
; AVX-LABEL: blend_packusdw:
12+
; AVX: # BB#0:
13+
; AVX-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
14+
; AVX-NEXT: retq
15+
%p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
16+
%p1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a2, <4 x i32> %a3)
17+
%s0 = shufflevector <8 x i16> %p0, <8 x i16> %p1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
18+
ret <8 x i16> %s0
19+
}
20+
21+
define <16 x i8> @blend_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
22+
; SSE41-LABEL: blend_packuswb:
23+
; SSE41: # BB#0:
24+
; SSE41-NEXT: packuswb %xmm2, %xmm0
25+
; SSE41-NEXT: retq
26+
;
27+
; AVX-LABEL: blend_packuswb:
28+
; AVX: # BB#0:
29+
; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
30+
; AVX-NEXT: retq
31+
%p0 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
32+
%p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
33+
%s0 = shufflevector <16 x i8> %p0, <16 x i8> %p1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
34+
ret <16 x i8> %s0
35+
}
36+
37+
define <8 x i16> @blend_packusdw_packuswb(<4 x i32> %a0, <4 x i32> %a1, <8 x i16> %a2, <8 x i16> %a3) {
38+
; SSE41-LABEL: blend_packusdw_packuswb:
39+
; SSE41: # BB#0:
40+
; SSE41-NEXT: packusdw %xmm1, %xmm0
41+
; SSE41-NEXT: packuswb %xmm3, %xmm2
42+
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
43+
; SSE41-NEXT: retq
44+
;
45+
; AVX-LABEL: blend_packusdw_packuswb:
46+
; AVX: # BB#0:
47+
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
48+
; AVX-NEXT: vpackuswb %xmm3, %xmm2, %xmm1
49+
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
50+
; AVX-NEXT: retq
51+
%p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
52+
%p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
53+
%b1 = bitcast <16 x i8> %p1 to <8 x i16>
54+
%s0 = shufflevector <8 x i16> %p0, <8 x i16> %b1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
55+
ret <8 x i16> %s0
56+
}
57+
58+
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
59+
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)

0 commit comments

Comments
 (0)