Skip to content

Commit 012c4b3

Browse files
RKSimon authored and yuxuanchen1997 committed
[DAG] combineAndnp - fold ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
Summary: Based on #96754 - if the ANDNP is zeroing out whole elements, fold the zeroing into the PSHUFB mask instead (PSHUFB writes zero to any byte whose mask byte has its MSB set). This allows us to replace the non-commutative ANDNP with a much simpler OR node. The fold is placed after the ANDNP shuffle combining, as that can do a better job of this for constant ANDNP + PSHUFB masks. Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60250657
1 parent 3117f18 commit 012c4b3

File tree

2 files changed

+40
-38
lines changed

2 files changed

+40
-38
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 22 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -53482,13 +53482,6 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
5348253482
if (SDValue Not = IsNOT(N0, DAG))
5348353483
return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1);
5348453484

53485-
// Fold for better commutativity:
53486-
// ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
53487-
if (N1->hasOneUse())
53488-
if (SDValue Not = IsNOT(N1, DAG))
53489-
return DAG.getNOT(
53490-
DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
53491-
5349253485
// Constant Folding
5349353486
APInt Undefs0, Undefs1;
5349453487
SmallVector<APInt> EltBits0, EltBits1;
@@ -53566,6 +53559,28 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
5356653559
}
5356753560
}
5356853561

53562+
// Folds for better commutativity:
53563+
if (N1->hasOneUse()) {
53564+
// ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)).
53565+
if (SDValue Not = IsNOT(N1, DAG))
53566+
return DAG.getNOT(
53567+
DL, DAG.getNode(ISD::OR, DL, VT, N0, DAG.getBitcast(VT, Not)), VT);
53568+
53569+
// ANDNP(x,PSHUFB(y,z)) -> PSHUFB(y,OR(z,x))
53570+
// Zero out elements by setting the PSHUFB mask value to 0xFF.
53571+
if (DAG.ComputeNumSignBits(N0) == EltSizeInBits) {
53572+
SDValue BC1 = peekThroughOneUseBitcasts(N1);
53573+
if (BC1.getOpcode() == X86ISD::PSHUFB) {
53574+
EVT ShufVT = BC1.getValueType();
53575+
SDValue NewMask = DAG.getNode(ISD::OR, DL, ShufVT, BC1.getOperand(1),
53576+
DAG.getBitcast(ShufVT, N0));
53577+
SDValue NewShuf =
53578+
DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, BC1.getOperand(0), NewMask);
53579+
return DAG.getBitcast(VT, NewShuf);
53580+
}
53581+
}
53582+
}
53583+
5356953584
return SDValue();
5357053585
}
5357153586

llvm/test/CodeGen/X86/var-permute-128.ll

Lines changed: 18 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -309,9 +309,8 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw
309309
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
310310
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
311311
; SSSE3-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
312+
; SSSE3-NEXT: por %xmm2, %xmm1
312313
; SSSE3-NEXT: pshufb %xmm1, %xmm0
313-
; SSSE3-NEXT: pandn %xmm0, %xmm2
314-
; SSSE3-NEXT: movdqa %xmm2, %xmm0
315314
; SSSE3-NEXT: retq
316315
;
317316
; SSE41-LABEL: var_shuffle_zero_v4i32:
@@ -322,9 +321,8 @@ define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounw
322321
; SSE41-NEXT: por %xmm2, %xmm1
323322
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
324323
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
324+
; SSE41-NEXT: por %xmm2, %xmm1
325325
; SSE41-NEXT: pshufb %xmm1, %xmm0
326-
; SSE41-NEXT: pandn %xmm0, %xmm2
327-
; SSE41-NEXT: movdqa %xmm2, %xmm0
328326
; SSE41-NEXT: retq
329327
;
330328
; XOP-LABEL: var_shuffle_zero_v4i32:
@@ -545,16 +543,15 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
545543
;
546544
; SSSE3-LABEL: var_shuffle_zero_v8i16:
547545
; SSSE3: # %bb.0:
548-
; SSSE3-NEXT: movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8]
549-
; SSSE3-NEXT: psubusw %xmm1, %xmm3
550-
; SSSE3-NEXT: pxor %xmm2, %xmm2
551-
; SSSE3-NEXT: pcmpeqw %xmm3, %xmm2
552-
; SSSE3-NEXT: por %xmm2, %xmm1
546+
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
547+
; SSSE3-NEXT: psubusw %xmm1, %xmm2
548+
; SSSE3-NEXT: pxor %xmm3, %xmm3
549+
; SSSE3-NEXT: pcmpeqw %xmm2, %xmm3
550+
; SSSE3-NEXT: por %xmm3, %xmm1
553551
; SSSE3-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
554552
; SSSE3-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
553+
; SSSE3-NEXT: por %xmm3, %xmm1
555554
; SSSE3-NEXT: pshufb %xmm1, %xmm0
556-
; SSSE3-NEXT: pandn %xmm0, %xmm2
557-
; SSSE3-NEXT: movdqa %xmm2, %xmm0
558555
; SSSE3-NEXT: retq
559556
;
560557
; SSE41-LABEL: var_shuffle_zero_v8i16:
@@ -565,9 +562,8 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
565562
; SSE41-NEXT: por %xmm2, %xmm1
566563
; SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
567564
; SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
565+
; SSE41-NEXT: por %xmm2, %xmm1
568566
; SSE41-NEXT: pshufb %xmm1, %xmm0
569-
; SSE41-NEXT: pandn %xmm0, %xmm2
570-
; SSE41-NEXT: movdqa %xmm2, %xmm0
571567
; SSE41-NEXT: retq
572568
;
573569
; XOP-LABEL: var_shuffle_zero_v8i16:
@@ -576,8 +572,8 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
576572
; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1
577573
; XOP-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
578574
; XOP-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
575+
; XOP-NEXT: vpor %xmm2, %xmm1, %xmm1
579576
; XOP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
580-
; XOP-NEXT: vpandn %xmm0, %xmm2, %xmm0
581577
; XOP-NEXT: retq
582578
;
583579
; AVX1-LABEL: var_shuffle_zero_v8i16:
@@ -587,8 +583,8 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
587583
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
588584
; AVX1-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
589585
; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
586+
; AVX1-NEXT: vpor %xmm2, %xmm1, %xmm1
590587
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
591-
; AVX1-NEXT: vpandn %xmm0, %xmm2, %xmm0
592588
; AVX1-NEXT: retq
593589
;
594590
; AVX2-LABEL: var_shuffle_zero_v8i16:
@@ -598,8 +594,8 @@ define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounw
598594
; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
599595
; AVX2-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
600596
; AVX2-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
597+
; AVX2-NEXT: vpor %xmm2, %xmm1, %xmm1
601598
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
602-
; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0
603599
; AVX2-NEXT: retq
604600
;
605601
; AVX512VL-LABEL: var_shuffle_zero_v8i16:
@@ -889,29 +885,24 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
889885
; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
890886
; SSSE3-NEXT: pmaxub %xmm1, %xmm2
891887
; SSSE3-NEXT: pcmpeqb %xmm1, %xmm2
892-
; SSSE3-NEXT: por %xmm2, %xmm1
893-
; SSSE3-NEXT: pshufb %xmm1, %xmm0
894-
; SSSE3-NEXT: pandn %xmm0, %xmm2
895-
; SSSE3-NEXT: movdqa %xmm2, %xmm0
888+
; SSSE3-NEXT: por %xmm1, %xmm2
889+
; SSSE3-NEXT: pshufb %xmm2, %xmm0
896890
; SSSE3-NEXT: retq
897891
;
898892
; SSE41-LABEL: var_shuffle_zero_v16i8:
899893
; SSE41: # %bb.0:
900894
; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
901895
; SSE41-NEXT: pmaxub %xmm1, %xmm2
902896
; SSE41-NEXT: pcmpeqb %xmm1, %xmm2
903-
; SSE41-NEXT: por %xmm2, %xmm1
904-
; SSE41-NEXT: pshufb %xmm1, %xmm0
905-
; SSE41-NEXT: pandn %xmm0, %xmm2
906-
; SSE41-NEXT: movdqa %xmm2, %xmm0
897+
; SSE41-NEXT: por %xmm1, %xmm2
898+
; SSE41-NEXT: pshufb %xmm2, %xmm0
907899
; SSE41-NEXT: retq
908900
;
909901
; XOP-LABEL: var_shuffle_zero_v16i8:
910902
; XOP: # %bb.0:
911903
; XOP-NEXT: vpcomgtub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
912904
; XOP-NEXT: vpor %xmm1, %xmm2, %xmm1
913905
; XOP-NEXT: vpshufb %xmm1, %xmm0, %xmm0
914-
; XOP-NEXT: vpandn %xmm0, %xmm2, %xmm0
915906
; XOP-NEXT: retq
916907
;
917908
; AVX1-LABEL: var_shuffle_zero_v16i8:
@@ -920,7 +911,6 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
920911
; AVX1-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
921912
; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
922913
; AVX1-NEXT: vpshufb %xmm1, %xmm0, %xmm0
923-
; AVX1-NEXT: vpandn %xmm0, %xmm2, %xmm0
924914
; AVX1-NEXT: retq
925915
;
926916
; AVX2-LABEL: var_shuffle_zero_v16i8:
@@ -929,7 +919,6 @@ define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounw
929919
; AVX2-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
930920
; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
931921
; AVX2-NEXT: vpshufb %xmm1, %xmm0, %xmm0
932-
; AVX2-NEXT: vpandn %xmm0, %xmm2, %xmm0
933922
; AVX2-NEXT: retq
934923
;
935924
; AVX512VL-LABEL: var_shuffle_zero_v16i8:
@@ -1289,9 +1278,8 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
12891278
; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
12901279
; SSSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
12911280
; SSSE3-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1281+
; SSSE3-NEXT: por %xmm2, %xmm1
12921282
; SSSE3-NEXT: pshufb %xmm1, %xmm0
1293-
; SSSE3-NEXT: pandn %xmm0, %xmm2
1294-
; SSSE3-NEXT: movdqa %xmm2, %xmm0
12951283
; SSSE3-NEXT: retq
12961284
;
12971285
; SSE41-LABEL: var_shuffle_zero_v4f32:
@@ -1302,9 +1290,8 @@ define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) n
13021290
; SSE41-NEXT: por %xmm2, %xmm1
13031291
; SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
13041292
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1293+
; SSE41-NEXT: por %xmm2, %xmm1
13051294
; SSE41-NEXT: pshufb %xmm1, %xmm0
1306-
; SSE41-NEXT: pandn %xmm0, %xmm2
1307-
; SSE41-NEXT: movdqa %xmm2, %xmm0
13081295
; SSE41-NEXT: retq
13091296
;
13101297
; XOP-LABEL: var_shuffle_zero_v4f32:

0 commit comments

Comments
 (0)