Skip to content

[X86] LowerShift - don't prematurely lower to x86 vector shift imm instructions #120282

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 18, 2024

Conversation

RKSimon
Copy link
Collaborator

@RKSimon RKSimon commented Dec 17, 2024

When splitting 2 unique amount shifts to shuffle(shift(x,c1),shift(x,c2)), don't use getTargetVShiftByConstNode directly to lower, use generic shifts to ensure we make use of any further canonicalization: shl(X,1) to add(X,X) etc. - this can have notably better throughput on some x86 targets.

Noticed on #120270

@llvmbot
Copy link
Member

llvmbot commented Dec 17, 2024

@llvm/pr-subscribers-backend-x86

Author: Simon Pilgrim (RKSimon)

Changes

When splitting 2 unique amount shifts to shuffle(shift(x,c1),shift(x,c2)), don't use getTargetVShiftByConstNode directly to lower, use generic shifts to ensure we make use of any further canonicalization: shl(X,1) to add(X,X) etc. - this can have notably better throughput on some x86 targets.

Noticed on #120270


Full diff: https://github.com/llvm/llvm-project/pull/120282.diff

5 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+4-4)
  • (modified) llvm/test/CodeGen/X86/combine-sdiv.ll (+8-8)
  • (modified) llvm/test/CodeGen/X86/lower-vec-shift.ll (+7-7)
  • (modified) llvm/test/CodeGen/X86/vec_shift6.ll (+2-2)
  • (modified) llvm/test/CodeGen/X86/widen_arith-4.ll (+1-1)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2571873dba8483..5b6c8a43453780 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30100,10 +30100,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
       auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2);
       if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) &&
           Cst2->getAPIntValue().ult(EltSizeInBits)) {
-        SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
-                                                    Cst1->getZExtValue(), DAG);
-        SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R,
-                                                    Cst2->getZExtValue(), DAG);
+        SDValue Shift1 = DAG.getNode(
+            Opc, dl, VT, R, DAG.getConstant(Cst1->getZExtValue(), dl, VT));
+        SDValue Shift2 = DAG.getNode(
+            Opc, dl, VT, R, DAG.getConstant(Cst2->getZExtValue(), dl, VT));
         return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask);
       }
     }
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 76f21604500482..42f09d04da26ed 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -2190,7 +2190,7 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; SSE41-NEXT:    pxor %xmm4, %xmm4
 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    psllw $1, %xmm2
+; SSE41-NEXT:    paddw %xmm2, %xmm2
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4,5],xmm2[6],xmm4[7]
 ; SSE41-NEXT:    psrlw $8, %xmm2
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
@@ -2202,9 +2202,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; SSE41-NEXT:    psraw $8, %xmm0
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    psllw $1, %xmm3
-; SSE41-NEXT:    psllw $7, %xmm0
-; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5],xmm0[6],xmm3[7]
+; SSE41-NEXT:    psllw $7, %xmm3
+; SSE41-NEXT:    paddw %xmm0, %xmm0
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4],xmm0[5],xmm3[6],xmm0[7]
 ; SSE41-NEXT:    psrlw $8, %xmm0
 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE41-NEXT:    psraw $8, %xmm2
@@ -2225,7 +2225,7 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm1, %xmm2
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT:    vpsllw $1, %xmm4, %xmm4
+; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm4[2],xmm3[3,4,5],xmm4[6],xmm3[7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -2235,9 +2235,9 @@ define <16 x i8> @non_splat_minus_one_divisor_1(<16 x i8> %A) {
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm0, %xmm1
 ; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
 ; AVX1-NEXT:    vpsraw $8, %xmm2, %xmm2
-; AVX1-NEXT:    vpsllw $1, %xmm2, %xmm3
-; AVX1-NEXT:    vpsllw $7, %xmm2, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm3[5],xmm2[6],xmm3[7]
+; AVX1-NEXT:    vpsllw $7, %xmm2, %xmm3
+; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm2[5],xmm3[6],xmm2[7]
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/lower-vec-shift.ll b/llvm/test/CodeGen/X86/lower-vec-shift.ll
index 67e0c1b3cf2b3e..9d4935ef564def 100644
--- a/llvm/test/CodeGen/X86/lower-vec-shift.ll
+++ b/llvm/test/CodeGen/X86/lower-vec-shift.ll
@@ -265,11 +265,11 @@ define <16 x i16> @test11(<16 x i16> %a) {
 ; AVX1-LABEL: test11:
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm2
-; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4,5],xmm1[6],xmm2[7]
+; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm2
+; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4,5],xmm2[6],xmm1[7]
 ; AVX1-NEXT:    vpsllw $3, %xmm0, %xmm2
-; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -294,10 +294,10 @@ define <16 x i16> @test12(<16 x i16> %a) {
 ; AVX1:       # %bb.0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpsllw $3, %xmm1, %xmm2
-; AVX1-NEXT:    vpsllw $1, %xmm1, %xmm1
+; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2,3,4],xmm2[5,6,7]
 ; AVX1-NEXT:    vpsllw $3, %xmm0, %xmm2
-; AVX1-NEXT:    vpsllw $1, %xmm0, %xmm0
+; AVX1-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3,4],xmm2[5,6,7]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
@@ -305,7 +305,7 @@ define <16 x i16> @test12(<16 x i16> %a) {
 ; AVX2-LABEL: test12:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    vpsllw $3, %ymm0, %ymm1
-; AVX2-NEXT:    vpsllw $1, %ymm0, %ymm0
+; AVX2-NEXT:    vpaddw %ymm0, %ymm0, %ymm0
 ; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5,6,7],ymm0[8],ymm1[9],ymm0[10,11,12],ymm1[13,14,15]
 ; AVX2-NEXT:    retq
   %lshr = shl <16 x i16> %a, <i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3, i16 1, i16 3, i16 1, i16 1, i16 1, i16 3, i16 3, i16 3>
diff --git a/llvm/test/CodeGen/X86/vec_shift6.ll b/llvm/test/CodeGen/X86/vec_shift6.ll
index 59bc3940fcb31e..5bb61240a1755e 100644
--- a/llvm/test/CodeGen/X86/vec_shift6.ll
+++ b/llvm/test/CodeGen/X86/vec_shift6.ll
@@ -68,14 +68,14 @@ define <4 x i32> @test4(<4 x i32> %a) {
 ; SSE2-LABEL: test4:
 ; SSE2:       # %bb.0:
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    pslld $1, %xmm1
+; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: test4:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pslld $1, %xmm1
+; SSE41-NEXT:    paddd %xmm0, %xmm1
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/widen_arith-4.ll b/llvm/test/CodeGen/X86/widen_arith-4.ll
index c49882ffe0b389..ea6bf66fd2923a 100644
--- a/llvm/test/CodeGen/X86/widen_arith-4.ll
+++ b/llvm/test/CodeGen/X86/widen_arith-4.ll
@@ -65,7 +65,7 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind {
 ; SSE41-NEXT:    psubw %xmm0, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psllw $2, %xmm2
-; SSE41-NEXT:    psllw $1, %xmm1
+; SSE41-NEXT:    paddw %xmm1, %xmm1
 ; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1],xmm1[2,3,4,5,6,7]
 ; SSE41-NEXT:    pextrw $4, %xmm1, 8(%rcx,%rax)
 ; SSE41-NEXT:    movq %xmm2, (%rcx,%rax)

Comment on lines 30103 to 30106
SDValue Shift1 = DAG.getNode(
Opc, dl, VT, R, DAG.getConstant(Cst1->getZExtValue(), dl, VT));
SDValue Shift2 = DAG.getNode(
Opc, dl, VT, R, DAG.getConstant(Cst2->getZExtValue(), dl, VT));
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this undef safe? Or we don't need to consider it?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can freeze R to account for this

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Actually - its OK as it is - the shuffle below means we don't reference the same element from both shifts

@RKSimon RKSimon force-pushed the x86-vector-shl1-add branch from 7794e94 to 7a92b8e Compare December 18, 2024 13:30
…structions

When splitting 2 unique amount shifts to shuffle(shift(x,c1),shift(x,c2)), don't use getTargetVShiftByConstNode directly to lower, use generic shifts to ensure we make use of any further canonicalization: shl(X,1) to add(X,X) etc. - this can have notably better throughput on some x86 targets.

Noticed on llvm#120270
@RKSimon RKSimon force-pushed the x86-vector-shl1-add branch from 7a92b8e to be51ea5 Compare December 18, 2024 13:42
Copy link
Contributor

@phoebewang phoebewang left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM.

@RKSimon RKSimon merged commit 49fd2dd into llvm:main Dec 18, 2024
8 checks passed
@RKSimon RKSimon deleted the x86-vector-shl1-add branch December 18, 2024 16:08
@llvm-ci
Copy link
Collaborator

llvm-ci commented Dec 18, 2024

LLVM Buildbot has detected a new failure on builder lldb-remote-linux-ubuntu running on as-builder-9 while building llvm at step 16 "test-check-lldb-api".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/195/builds/2557

Here is the relevant piece of the build log for the reference
Step 16 (test-check-lldb-api) failure: Test just built components: check-lldb-api completed (failure)
******************** TEST 'lldb-api :: functionalities/thread/concurrent_events/TestConcurrentManySignals.py' FAILED ********************
Script:
--
/usr/bin/python3.12 /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/llvm-project/lldb/test/API/dotest.py -u CXXFLAGS -u CFLAGS --env LLVM_LIBS_DIR=/home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/./lib --env LLVM_INCLUDE_DIR=/home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/include --env LLVM_TOOLS_DIR=/home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/./bin --libcxx-include-dir /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/include/c++/v1 --libcxx-include-target-dir /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/include/aarch64-unknown-linux-gnu/c++/v1 --libcxx-library-dir /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/./lib/aarch64-unknown-linux-gnu --arch aarch64 --build-dir /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/lldb-test-build.noindex --lldb-module-cache-dir /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/lldb-test-build.noindex/module-cache-lldb/lldb-api --clang-module-cache-dir /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/lldb-test-build.noindex/module-cache-clang/lldb-api --executable /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/./bin/lldb --compiler /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/bin/clang --dsymutil /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/./bin/dsymutil --make /usr/bin/gmake --llvm-tools-dir /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/./bin --lldb-obj-root /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/tools/lldb --lldb-libs-dir /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/./lib --platform-url connect://jetson-agx-2198.lab.llvm.org:1234 --platform-working-dir /home/ubuntu/lldb-tests --sysroot /mnt/fs/jetson-agx-ubuntu --env ARCH_CFLAGS=-mcpu=cortex-a78 --platform-name remote-linux /home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/llvm-project/lldb/test/API/functionalities/thread/concurrent_events -p TestConcurrentManySignals.py
--
Exit Code: 1

Command Output (stdout):
--
lldb version 20.0.0git (https://github.com/llvm/llvm-project.git revision 49fd2dde21655f95309abb17ad1d3392afe4985f)
  clang revision 49fd2dde21655f95309abb17ad1d3392afe4985f
  llvm revision 49fd2dde21655f95309abb17ad1d3392afe4985f
Setting up remote platform 'remote-linux'
Connecting to remote platform 'remote-linux' at 'connect://jetson-agx-2198.lab.llvm.org:1234'...
Connected.
Setting remote platform working directory to '/home/ubuntu/lldb-tests'...
Skipping the following test categories: ['dsym', 'gmodules', 'debugserver', 'objc', 'lldb-dap']

--
Command Output (stderr):
--
WARNING:root:Custom libc++ is not supported for remote runs: ignoring --libcxx arguments
FAIL: LLDB (/home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/bin/clang-aarch64) :: test (TestConcurrentManySignals.ConcurrentManySignals.test)
======================================================================
FAIL: test (TestConcurrentManySignals.ConcurrentManySignals.test)
   Test 100 signals from 100 threads.
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/llvm-project/lldb/packages/Python/lldbsuite/test/decorators.py", line 148, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/llvm-project/lldb/packages/Python/lldbsuite/test/decorators.py", line 148, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/llvm-project/lldb/test/API/functionalities/thread/concurrent_events/TestConcurrentManySignals.py", line 17, in test
    self.do_thread_actions(num_signal_threads=100)
  File "/home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/llvm-project/lldb/packages/Python/lldbsuite/test/concurrent_base.py", line 261, in do_thread_actions
    self.assertEqual(
AssertionError: 1 != 0 : Expected main thread (finish) breakpoint to be hit once
Config=aarch64-/home/buildbot/worker/as-builder-9/lldb-remote-linux-ubuntu/build/bin/clang
----------------------------------------------------------------------
Ran 1 test in 9.535s

FAILED (failures=1)

--

********************


Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants