-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[X86] LowerShift - don't prematurely lower to x86 vector shift imm instructions #120282
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-x86 Author: Simon Pilgrim (RKSimon) ChangesWhen splitting 2 unique amount shifts to shuffle(shift(x,c1),shift(x,c2)), don't use getTargetVShiftByConstNode directly to lower, use generic shifts to ensure we make use of any further canonicalization: shl(X,1) to add(X,X) etc. - this can have notably better throughput on some x86 targets. Noticed on #120270 Full diff: https://github.com/llvm/llvm-project/pull/120282.diff 5 Files Affected:
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2571873dba8483..5b6c8a43453780 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30100,10 +30100,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
Cst2->getAPIntValue().ult(EltSizeInBits)) {
- SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
- Cst1->getZExtValue(), DAG);
- SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
- Cst2->getZExtValue(), DAG);
+ SDValue Shift1 = DAG.getNode(
+ Opc, dl, VT, R, DAG.getConstant(Cst1->getZExtValue(), dl, VT));
+ SDValue Shift2 = DAG.getNode(
+ Opc, dl, VT, R, DAG.getConstant(Cst2->getZExtValue(), dl, VT));
return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
}
}
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 76f21604500482..42f09d04da26ed 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2190,7 +2190,7 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE41-NEXT: pxor %xmm4, %xmm4
; SSE41-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT: psllw $1, %xmm2
+; SSE41-NEXT: paddw %xmm2, %xmm2
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7]
; SSE41-NEXT: psrlw $8, %xmm2
; SSE41-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
@@ -2202,9 +2202,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; SSE41-NEXT: psraw $8, %xmm0
; SSE41-NEXT: movdqa %xmm0, %xmm3
-; SSE41-NEXT: psllw $1, %xmm3
-; SSE41-NEXT: psllw $7, %xmm0
-; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
+; SSE41-NEXT: psllw $7, %xmm3
+; SSE41-NEXT: paddw %xmm0, %xmm0
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7]
; SSE41-NEXT: psrlw $8, %xmm0
; SSE41-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE41-NEXT: psraw $8, %xmm2
@@ -2225,7 +2225,7 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; AVX1-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT: vpsllw $1, %xmm4, %xmm4
+; AVX1-NEXT: vpaddw %xmm4, %xmm4, %xmm4
; AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7]
; AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -2235,9 +2235,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT: vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm2, %xmm3
-; AVX1-NEXT: vpsllw $7, %xmm2, %xmm2
-; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
+; AVX1-NEXT: vpsllw $7, %xmm2, %xmm3
+; AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7]
; AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT: vpsraw $8, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/lower-vec-shift.ll b/llvm/test/CodeGen/X86/lower-vec-shift.ll
index 67e0c1b3cf2b3e..9d4935ef564def 100644
--- a/llvm/test/CodeGen/X86/lower-vec-shift.ll
+++ b/llvm/test/CodeGen/X86/lower-vec-shift.ll
@@ -265,11 +265,11 @@ define <16 x i16> @test11(<16 x i16> %a) {
; AVX1-LABEL: test11:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsllw $1, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $3, %xmm1, %xmm1
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
+; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6],xmm1[7]
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -294,10 +294,10 @@ define <16 x i16> @test12(<16 x i16> %a) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpsllw $3, %xmm1, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm1, %xmm1
+; AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vpsllw $3, %xmm0, %xmm2
-; AVX1-NEXT: vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT: vpaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
@@ -305,7 +305,7 @@ define <16 x i16> @test12(<16 x i16> %a) {
; AVX2-LABEL: test12:
; AVX2: # %bb.0:
; AVX2-NEXT: vpsllw $3, %ymm0, %ymm1
-; AVX2-NEXT: vpsllw $1, %ymm0, %ymm0
+; AVX2-NEXT: vpaddw %ymm0, %ymm0, %ymm0
; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
; AVX2-NEXT: retq
%lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll
index 59bc3940fcb31e..5bb61240a1755e 100644
--- a/llvm/test/CodeGen/X86/vec_shift6.ll
+++ b/llvm/test/CodeGen/X86/vec_shift6.ll
@@ -68,14 +68,14 @@ define <4 x i32> @test4(<4 x i32> %a) {
; SSE2-LABEL: test4:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm1
-; SSE2-NEXT: pslld $1, %xmm1
+; SSE2-NEXT: paddd %xmm0, %xmm1
; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
; SSE2-NEXT: retq
;
; SSE41-LABEL: test4:
; SSE41: # %bb.0:
; SSE41-NEXT: movdqa %xmm0, %xmm1
-; SSE41-NEXT: pslld $1, %xmm1
+; SSE41-NEXT: paddd %xmm0, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/widen_arith-4.ll b/llvm/test/CodeGen/X86/widen_arith-4.ll
index c49882ffe0b389..ea6bf66fd2923a 100644
--- a/llvm/test/CodeGen/X86/widen_arith-4.ll
+++ b/llvm/test/CodeGen/X86/widen_arith-4.ll
@@ -65,7 +65,7 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind {
; SSE41-NEXT: psubw %xmm0, %xmm1
; SSE41-NEXT: movdqa %xmm1, %xmm2
; SSE41-NEXT: psllw $2, %xmm2
-; SSE41-NEXT: psllw $1, %xmm1
+; SSE41-NEXT: paddw %xmm1, %xmm1
; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT: pextrw $4, %xmm1, 8(%rcx,%rax)
; SSE41-NEXT: movq %xmm2, (%rcx,%rax)
|
SDValue Shift1 = DAG.getNode( | ||
Opc, dl, VT, R, DAG.getConstant(Cst1->getZExtValue(), dl, VT)); | ||
SDValue Shift2 = DAG.getNode( | ||
Opc, dl, VT, R, DAG.getConstant(Cst2->getZExtValue(), dl, VT)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this undef safe? Or we don't need to consider it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I can freeze R to account for this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually - its OK as it is - the shuffle below means we don't reference the same element from both shifts
7794e94
to
7a92b8e
Compare
…structions When splitting 2 unique amount shifts to shuffle(shift(x,c1),shift(x,c2)), don't use getTargetVShiftByConstNode directly to lower, use generic shifts to ensure we make use of any further canonicalization: shl(X,1) to add(X,X) etc. - this can have notably better throughput on some x86 targets. Noticed on llvm#120270
7a92b8e
to
be51ea5
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/195/builds/2557 Here is the relevant piece of the build log for the reference
|
When splitting 2 unique amount shifts to shuffle(shift(x,c1),shift(x,c2)), don't use getTargetVShiftByConstNode directly to lower, use generic shifts to ensure we make use of any further canonicalization: shl(X,1) to add(X,X) etc. - this can have notably better throughput on some x86 targets.
Noticed on #120270