Commit 9d1721c

[X86][SSE] Prefer PACKUS(AND(),AND()) to SHUFFLE(PSHUFB(),PSHUFB()) on pre-AVX2 targets
As discussed on PR31443, we should be trying to use PACKUS for binary truncation patterns to reduce the number of shuffles. The plan is to support AVX2+ targets once we've worked around PR45315 - we fail to peek through a VBROADCAST_LOAD mask to recognise zero upper bits in a PACKUS pattern. We should also be able to add support for v8i16 and possibly 256/512-bit vectors as well.
1 parent 3eef474 commit 9d1721c

9 files changed: 190 additions (+), 328 deletions (-)
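For orientation, the two lowerings compared in the commit message correspond roughly to the following SSE intrinsic sequences for narrowing two <8 x i16> vectors (whose elements already fit in a byte) into one <16 x i8> vector. This is a sketch only, not code from the patch; the helper names are made up, and it assumes a target with SSSE3 available (e.g. compiled with -mssse3).

// Not part of the patch: a standalone sketch of the two instruction patterns.
#include <emmintrin.h>   // SSE2: _mm_and_si128, _mm_packus_epi16, _mm_unpacklo_epi64
#include <tmmintrin.h>   // SSSE3: _mm_shuffle_epi8

// Old lowering: SHUFFLE(PSHUFB(),PSHUFB()) - two byte shuffles feeding an unpack.
static __m128i narrow_via_pshufb(__m128i lo, __m128i hi) {
  const __m128i idx = _mm_setr_epi8(0, 2, 4, 6, 8, 10, 12, 14,
                                    -1, -1, -1, -1, -1, -1, -1, -1);
  __m128i l = _mm_shuffle_epi8(lo, idx);   // pshufb: gather the low byte of each word
  __m128i h = _mm_shuffle_epi8(hi, idx);   // pshufb
  return _mm_unpacklo_epi64(l, h);         // punpcklqdq
}

// New pre-AVX2 lowering: PACKUS(AND(),AND()) - two cheap ANDs feeding one pack.
static __m128i narrow_via_packus(__m128i lo, __m128i hi) {
  const __m128i mask = _mm_set1_epi16(0x00FF);        // [255,0,255,0,...]
  return _mm_packus_epi16(_mm_and_si128(lo, mask),    // pand
                          _mm_and_si128(hi, mask));   // pand + packuswb
}

Both helpers return the same <16 x i8> value whenever each i16 element is already in the 0-255 range, which is exactly the zero-upper-bits property the PACKUS pattern relies on.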

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 12 additions & 3 deletions
@@ -14817,6 +14817,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                 Zeroable, Subtarget, DAG))
     return V;
 
+  // Check for compaction patterns.
+  bool IsSingleInput = V2.isUndef();
+  int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
+
   // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
   // with PSHUFB. It is important to do this before we attempt to generate any
   // blends but after all of the single-input lowerings. If the single input
@@ -14827,10 +14831,16 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // and there are *very* few patterns that would actually be faster than the
   // PSHUFB approach because of its ability to zero lanes.
   //
+  // If the mask is a binary compaction, we can more efficiently perform this
+  // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
+  // TODO: AVX2+ sees a regression as they fail to see through VBROADCAST_LOAD
+  // masks.
+  //
   // FIXME: The only exceptions to the above are blends which are exact
   // interleavings with direct instructions supporting them. We currently don't
   // handle those well here.
-  if (Subtarget.hasSSSE3()) {
+  if (Subtarget.hasSSSE3() &&
+      (Subtarget.hasInt256() || IsSingleInput || NumEvenDrops != 1)) {
     bool V1InUse = false;
     bool V2InUse = false;
 
@@ -14888,8 +14898,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   // We special case these as they can be particularly efficiently handled with
   // the PACKUSB instruction on x86 and they show up in common patterns of
   // rearranging bytes to truncate wide elements.
-  bool IsSingleInput = V2.isUndef();
-  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
+  if (NumEvenDrops) {
     // NumEvenDrops is the power of two stride of the elements. Another way of
     // thinking about it is that we need to drop the even elements this many
     // times to get the original input.
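As a reading aid for the new condition (an illustration only, not code from the patch): the two-input byte-truncation masks rewritten in the tests below select every second byte of the two concatenated inputs, which canLowerByDroppingEvenElements reports as a single even-element drop, so on pre-AVX2 targets the PSHUFB path is now skipped and the existing PACKUS lowering is used instead.

// Illustration only: the <16 x i8> shuffle mask produced when truncating two
// <8 x i16> inputs (V1 and V2 concatenated as 32 bytes) down to their low bytes.
// For this shape the helper returns NumEvenDrops == 1 and IsSingleInput is
// false, so the new check above sends pre-AVX2 targets past the PSHUFB path.
static const int TruncMask[16] = {0,  2,  4,  6,  8,  10, 12, 14,
                                  16, 18, 20, 22, 24, 26, 28, 30};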

llvm/test/CodeGen/X86/masked_store_trunc.ll

Lines changed: 11 additions & 11 deletions
@@ -4652,10 +4652,10 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma
 ; SSE4-LABEL: truncstore_v32i16_v32i8:
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: pxor %xmm7, %xmm7
-; SSE4-NEXT: movdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE4-NEXT: pshufb %xmm6, %xmm1
-; SSE4-NEXT: pshufb %xmm6, %xmm0
-; SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE4-NEXT: pand %xmm6, %xmm1
+; SSE4-NEXT: pand %xmm6, %xmm0
+; SSE4-NEXT: packuswb %xmm1, %xmm0
 ; SSE4-NEXT: pcmpeqb %xmm7, %xmm4
 ; SSE4-NEXT: pmovmskb %xmm4, %ecx
 ; SSE4-NEXT: xorl $65535, %ecx # imm = 0xFFFF
@@ -4711,14 +4711,14 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma
 ; SSE4-NEXT: .LBB15_29: # %cond.store27
 ; SSE4-NEXT: pextrb $14, %xmm0, 14(%rdi)
 ; SSE4-NEXT: .LBB15_30: # %else28
-; SSE4-NEXT: pshufb %xmm6, %xmm3
-; SSE4-NEXT: pshufb %xmm6, %xmm2
+; SSE4-NEXT: pand %xmm6, %xmm3
+; SSE4-NEXT: pand %xmm6, %xmm2
 ; SSE4-NEXT: testl $32768, %eax # imm = 0x8000
 ; SSE4-NEXT: je .LBB15_32
 ; SSE4-NEXT: # %bb.31: # %cond.store29
 ; SSE4-NEXT: pextrb $15, %xmm0, 15(%rdi)
 ; SSE4-NEXT: .LBB15_32: # %else30
-; SSE4-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE4-NEXT: packuswb %xmm3, %xmm2
 ; SSE4-NEXT: testl $65536, %eax # imm = 0x10000
 ; SSE4-NEXT: jne .LBB15_33
 ; SSE4-NEXT: # %bb.34: # %else32
@@ -5750,10 +5750,10 @@ define void @truncstore_v16i16_v16i8(<16 x i16> %x, <16 x i8>* %p, <16 x i8> %ma
 ; SSE4-LABEL: truncstore_v16i16_v16i8:
 ; SSE4: # %bb.0:
 ; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE4-NEXT: pshufb %xmm4, %xmm1
-; SSE4-NEXT: pshufb %xmm4, %xmm0
-; SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE4-NEXT: pand %xmm4, %xmm1
+; SSE4-NEXT: pand %xmm4, %xmm0
+; SSE4-NEXT: packuswb %xmm1, %xmm0
 ; SSE4-NEXT: pcmpeqb %xmm2, %xmm3
 ; SSE4-NEXT: pmovmskb %xmm3, %eax
 ; SSE4-NEXT: xorl $65535, %eax # imm = 0xFFFF

llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll

Lines changed: 19 additions & 10 deletions
@@ -13,16 +13,25 @@
 ; Ideally, the shuffles should be lowered to code with the same quality as the truncates.
 
 define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
-; AVX-LABEL: shuffle_v32i8_to_v16i8:
-; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
-; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX-NEXT: retq
+; AVX1-LABEL: shuffle_v32i8_to_v16i8:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT: vpand 16(%rdi), %xmm0, %xmm1
+; AVX1-NEXT: vpand (%rdi), %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: shuffle_v32i8_to_v16i8:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1
+; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vmovdqa %xmm0, (%rsi)
+; AVX2-NEXT: retq
 ;
 ; AVX512F-LABEL: shuffle_v32i8_to_v16i8:
 ; AVX512F: # %bb.0:

llvm/test/CodeGen/X86/vector-reduce-and-bool.ll

Lines changed: 26 additions & 54 deletions
@@ -356,29 +356,17 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
 }
 
 define i1 @trunc_v16i16_v16i1(<16 x i16>) {
-; SSE2-LABEL: trunc_v16i16_v16i1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: psllw $7, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: cmpw $-1, %ax
-; SSE2-NEXT: sete %al
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: trunc_v16i16_v16i1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm2, %xmm1
-; SSE41-NEXT: pshufb %xmm2, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: psllw $7, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: cmpw $-1, %ax
-; SSE41-NEXT: sete %al
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_v16i16_v16i1:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: psllw $7, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: cmpw $-1, %ax
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_v16i16_v16i1:
 ; AVX1: # %bb.0:
@@ -695,37 +683,21 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) {
 }
 
 define i1 @trunc_v32i16_v32i1(<32 x i16>) {
-; SSE2-LABEL: trunc_v32i16_v32i1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: psllw $7, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: cmpw $-1, %ax
-; SSE2-NEXT: sete %al
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: trunc_v32i16_v32i1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm4, %xmm3
-; SSE41-NEXT: pshufb %xmm4, %xmm2
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE41-NEXT: pshufb %xmm4, %xmm1
-; SSE41-NEXT: pshufb %xmm4, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: pand %xmm2, %xmm0
-; SSE41-NEXT: psllw $7, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: cmpw $-1, %ax
-; SSE41-NEXT: sete %al
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_v32i16_v32i1:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: psllw $7, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: cmpw $-1, %ax
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_v32i16_v32i1:
 ; AVX1: # %bb.0:

llvm/test/CodeGen/X86/vector-reduce-or-bool.ll

Lines changed: 26 additions & 54 deletions
@@ -350,29 +350,17 @@ define i1 @trunc_v8i32_v8i1(<8 x i32>) {
 }
 
 define i1 @trunc_v16i16_v16i1(<16 x i16>) {
-; SSE2-LABEL: trunc_v16i16_v16i1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pand %xmm2, %xmm1
-; SSE2-NEXT: pand %xmm2, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: psllw $7, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testw %ax, %ax
-; SSE2-NEXT: setne %al
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: trunc_v16i16_v16i1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm2, %xmm1
-; SSE41-NEXT: pshufb %xmm2, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: psllw $7, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testw %ax, %ax
-; SSE41-NEXT: setne %al
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_v16i16_v16i1:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT: pand %xmm2, %xmm1
+; SSE-NEXT: pand %xmm2, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: psllw $7, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: testw %ax, %ax
+; SSE-NEXT: setne %al
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_v16i16_v16i1:
 ; AVX1: # %bb.0:
@@ -689,37 +677,21 @@ define i1 @trunc_v16i32_v16i1(<16 x i32>) {
 }
 
 define i1 @trunc_v32i16_v32i1(<32 x i16>) {
-; SSE2-LABEL: trunc_v32i16_v32i1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT: pand %xmm4, %xmm3
-; SSE2-NEXT: pand %xmm4, %xmm2
-; SSE2-NEXT: packuswb %xmm3, %xmm2
-; SSE2-NEXT: pand %xmm4, %xmm1
-; SSE2-NEXT: pand %xmm4, %xmm0
-; SSE2-NEXT: packuswb %xmm1, %xmm0
-; SSE2-NEXT: por %xmm2, %xmm0
-; SSE2-NEXT: psllw $7, %xmm0
-; SSE2-NEXT: pmovmskb %xmm0, %eax
-; SSE2-NEXT: testw %ax, %ax
-; SSE2-NEXT: setne %al
-; SSE2-NEXT: retq
-;
-; SSE41-LABEL: trunc_v32i16_v32i1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE41-NEXT: pshufb %xmm4, %xmm3
-; SSE41-NEXT: pshufb %xmm4, %xmm2
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE41-NEXT: pshufb %xmm4, %xmm1
-; SSE41-NEXT: pshufb %xmm4, %xmm0
-; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE41-NEXT: por %xmm2, %xmm0
-; SSE41-NEXT: psllw $7, %xmm0
-; SSE41-NEXT: pmovmskb %xmm0, %eax
-; SSE41-NEXT: testw %ax, %ax
-; SSE41-NEXT: setne %al
-; SSE41-NEXT: retq
+; SSE-LABEL: trunc_v32i16_v32i1:
+; SSE: # %bb.0:
+; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT: pand %xmm4, %xmm3
+; SSE-NEXT: pand %xmm4, %xmm2
+; SSE-NEXT: packuswb %xmm3, %xmm2
+; SSE-NEXT: pand %xmm4, %xmm1
+; SSE-NEXT: pand %xmm4, %xmm0
+; SSE-NEXT: packuswb %xmm1, %xmm0
+; SSE-NEXT: por %xmm2, %xmm0
+; SSE-NEXT: psllw $7, %xmm0
+; SSE-NEXT: pmovmskb %xmm0, %eax
+; SSE-NEXT: testw %ax, %ax
+; SSE-NEXT: setne %al
+; SSE-NEXT: retq
 ;
 ; AVX1-LABEL: trunc_v32i16_v32i1:
 ; AVX1: # %bb.0:
