Skip to content

Commit cdfe078

Browse files
committed
[X86] Lower blended PACKUSes using appropriate types.
When lowering a blend of two PACKUS nodes, we used to disregard the types of the PACKUS inputs, indiscriminately generating a v16i8 PACKUS. This leads to non-selectable nodes such as: `(v16i8 (PACKUS (v4i32 v0), (v4i32 v1)))`. Instead, check that both PACKUSes have the same type, and use that type as the final result type. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@274138 91177308-0d34-0410-b5e6-96231b3b80d8
1 parent f98b39d commit cdfe078

File tree

2 files changed

+73
-11
lines changed

2 files changed

+73
-11
lines changed

lib/Target/X86/X86ISelLowering.cpp

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8943,22 +8943,25 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
89438943
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
89448944
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
89458945

8946-
// If we have a blend of two PACKUS operations an the blend aligns with the
8947-
// low and half halves, we can just merge the PACKUS operations. This is
8948-
// particularly important as it lets us merge shuffles that this routine itself
8949-
// creates.
8946+
// If we have a blend of two same-type PACKUS operations and the blend aligns
8947+
// with the low and high halves, we can just merge the PACKUS operations.
8948+
// This is particularly important as it lets us merge shuffles that this
8949+
// routine itself creates.
89508950
auto GetPackNode = [](SDValue V) {
89518951
V = peekThroughBitcasts(V);
89528952
return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
89538953
};
89548954
if (SDValue V1Pack = GetPackNode(V1))
8955-
if (SDValue V2Pack = GetPackNode(V2))
8956-
return DAG.getBitcast(MVT::v2i64,
8957-
DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8,
8958-
Mask[0] == 0 ? V1Pack.getOperand(0)
8959-
: V1Pack.getOperand(1),
8960-
Mask[1] == 2 ? V2Pack.getOperand(0)
8961-
: V2Pack.getOperand(1)));
8955+
if (SDValue V2Pack = GetPackNode(V2)) {
8956+
EVT PackVT = V1Pack.getValueType();
8957+
if (PackVT == V2Pack.getValueType())
8958+
return DAG.getBitcast(MVT::v2i64,
8959+
DAG.getNode(X86ISD::PACKUS, DL, PackVT,
8960+
Mask[0] == 0 ? V1Pack.getOperand(0)
8961+
: V1Pack.getOperand(1),
8962+
Mask[1] == 2 ? V2Pack.getOperand(0)
8963+
: V2Pack.getOperand(1)));
8964+
}
89628965

89638966
// Try to use shift instructions.
89648967
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE41
3+
; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX
4+
5+
define <8 x i16> @blend_packusdw(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3) {
6+
; SSE41-LABEL: blend_packusdw:
7+
; SSE41: # BB#0:
8+
; SSE41-NEXT: packusdw %xmm2, %xmm0
9+
; SSE41-NEXT: retq
10+
;
11+
; AVX-LABEL: blend_packusdw:
12+
; AVX: # BB#0:
13+
; AVX-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
14+
; AVX-NEXT: retq
15+
%p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
16+
%p1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a2, <4 x i32> %a3)
17+
%s0 = shufflevector <8 x i16> %p0, <8 x i16> %p1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
18+
ret <8 x i16> %s0
19+
}
20+
21+
define <16 x i8> @blend_packuswb(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> %a2, <8 x i16> %a3) {
22+
; SSE41-LABEL: blend_packuswb:
23+
; SSE41: # BB#0:
24+
; SSE41-NEXT: packuswb %xmm2, %xmm0
25+
; SSE41-NEXT: retq
26+
;
27+
; AVX-LABEL: blend_packuswb:
28+
; AVX: # BB#0:
29+
; AVX-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
30+
; AVX-NEXT: retq
31+
%p0 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
32+
%p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
33+
%s0 = shufflevector <16 x i8> %p0, <16 x i8> %p1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
34+
ret <16 x i8> %s0
35+
}
36+
37+
define <8 x i16> @blend_packusdw_packuswb(<4 x i32> %a0, <4 x i32> %a1, <8 x i16> %a2, <8 x i16> %a3) {
38+
; SSE41-LABEL: blend_packusdw_packuswb:
39+
; SSE41: # BB#0:
40+
; SSE41-NEXT: packusdw %xmm1, %xmm0
41+
; SSE41-NEXT: packuswb %xmm3, %xmm2
42+
; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
43+
; SSE41-NEXT: retq
44+
;
45+
; AVX-LABEL: blend_packusdw_packuswb:
46+
; AVX: # BB#0:
47+
; AVX-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
48+
; AVX-NEXT: vpackuswb %xmm3, %xmm2, %xmm1
49+
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
50+
; AVX-NEXT: retq
51+
%p0 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
52+
%p1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a2, <8 x i16> %a3)
53+
%b1 = bitcast <16 x i8> %p1 to <8 x i16>
54+
%s0 = shufflevector <8 x i16> %p0, <8 x i16> %b1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
55+
ret <8 x i16> %s0
56+
}
57+
58+
declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
59+
declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)

0 commit comments

Comments
 (0)