[X86] Invert (and X, ~(and ~Y, Z)) back into (and X, (or Y, ~Z)) #109215

Merged
1 commit merged into llvm:main on Oct 12, 2024

Conversation

Saldivarcher
Contributor

@Saldivarcher Saldivarcher commented Sep 18, 2024

When `andn` is available, we should avoid switching `s &= ~(z & ~y);` into `s &= ~z | y;`.

This patch turns this assembly from:

foo:
        not     rcx
        and     rsi, rdx
        andn    rax, rsi, rdi
        or      rcx, rdx
        and     rax, rcx
        ret

into:

foo:
        and     rsi, rdx
        andn    rcx, rdx, rcx
        andn    rax, rsi, rdi
        andn    rax, rcx, rax
        ret

Fixes #108731

@Saldivarcher
Contributor Author

@RKSimon I wasn't sure if this was the correct approach for this, but this is the direction I went in.

@Saldivarcher
Contributor Author

Saldivarcher commented Sep 18, 2024

Here is what the assembly looks like with my changes:

$ cat repro.c 
#include <stdint.h>

int64_t foo(int64_t w, int64_t x, int64_t y, int64_t z) {
  int64_t s = w;
  s &= ~(x & y);
  s &= ~(z & ~y);
  return s;
}

typedef uint8_t vec16u8 __attribute__((vector_size(16)));

vec16u8 fooVec(vec16u8 w, vec16u8 x, vec16u8 y, vec16u8 z) {
  vec16u8 s = w;
  s &= ~(x & y);
  s &= ~(z & ~y);
  return s;
}
$ ~/workspace/work/llvm-project.git/build_debug/bin/clang repro.c -O3 -S -masm=intel -march=znver3
$ cat repro.s
        .text
        .intel_syntax noprefix
        .file   "repro.c"
        .globl  foo                             # -- Begin function foo
        .p2align        4, 0x90
        .type   foo,@function
foo:                                    # @foo
        .cfi_startproc
# %bb.0:                                # %entry
        and     rsi, rdx
        andn    rcx, rdx, rcx
        andn    rax, rsi, rdi
        andn    rax, rcx, rax
        ret
.Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
        .cfi_endproc
                                        # -- End function
        .globl  fooVec                          # -- Begin function fooVec
        .p2align        4, 0x90
        .type   fooVec,@function
fooVec:                                 # @fooVec
        .cfi_startproc
# %bb.0:                                # %entry
        vandps  xmm1, xmm2, xmm1
        vandnps xmm2, xmm2, xmm3
        vandnps xmm0, xmm1, xmm0
        vandnps xmm0, xmm2, xmm0
        ret
.Lfunc_end1:
        .size   fooVec, .Lfunc_end1-fooVec
        .cfi_endproc
                                        # -- End function
        .ident  "clang version 20.0.0git ([email protected]:llvm/llvm-project.git f00c946c2da0caf6da4a49e87ac905a8b1d2e8b6)"
        .section        ".note.GNU-stack","",@progbits
        .addrsig

@topperc
Collaborator

topperc commented Sep 18, 2024

InstCombine generally does not perform subtarget-aware optimizations like this; a lot of what InstCombine does is considered "canonicalization".
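For reference, here is a minimal IR sketch (function and value names are illustrative, not taken from the patch) of the canonicalization being discussed; running it through `opt -passes=instcombine` should rewrite the inner `~(z & ~y)` chain into the `~z | y` form described in the patch summary, regardless of whether the target has an and-not instruction:

```llvm
; Sketch of what `s &= ~(z & ~y);` looks like before InstCombine canonicalizes it.
define i64 @andnot_form(i64 %s, i64 %y, i64 %z) {
  %not.y = xor i64 %y, -1       ; ~y
  %t     = and i64 %z, %not.y   ; z & ~y
  %not.t = xor i64 %t, -1       ; ~(z & ~y)
  %r     = and i64 %s, %not.t   ; s & ~(z & ~y)
  ret i64 %r
}
```

That target-independent canonicalization is why an InstCombine-only fix is unattractive here.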

@topperc
Collaborator

topperc commented Sep 18, 2024

Preventing the transform in InstCombine only solves the problem if the user was aware of the existence of `andn` and wrote their code as `s &= ~(z & ~y)`. A user unaware of `andn` might write `s &= ~z | y` in their code because it looks simpler.
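As an illustration of that point, a hypothetical function (names are made up) whose IR already arrives in the or form; there is no InstCombine transform to suppress here, so only a DAG-level fold can turn it back into an `andn`-friendly shape:

```llvm
; Sketch of what a user writing `s &= ~z | y;` directly would produce.
define i64 @orn_form(i64 %s, i64 %y, i64 %z) {
  %not.z = xor i64 %z, -1       ; ~z
  %o     = or i64 %not.z, %y    ; ~z | y
  %r     = and i64 %s, %o       ; s & (~z | y)
  ret i64 %r
}
```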

@RKSimon
Collaborator

RKSimon commented Sep 19, 2024

@Saldivarcher This needs to be done in DAGCombine, not InstCombine, to allow us to properly account for a target's support for ANDNOT instructions. I did an initial test yesterday, which hit infinite loops in VE and AArch64 targets, so to initially address #108731 we could just handle this in X86ISelLowering.

This is what I tried as an initial investigation inside DAGCombiner::visitAND:

  // Fold (and X, (or Y, ~Z)) -> (and X, ~(and ~Y, Z))
  if (TLI.hasAndNot(SDValue(N, 0))) {
    SDValue X, Y, Z;
    if (sd_match(N, m_And(m_Value(X),
                          m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z)))))))
      return DAG.getNode(
          ISD::AND, DL, VT, X,
          DAG.getNOT(
              DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
  }

You could try just moving it to X86's combineAnd and replacing the hasAndNot call.

@nikic
Contributor

nikic commented Sep 19, 2024

Landed f1ff3a2 to hopefully save the next person some time.

@Saldivarcher Saldivarcher changed the title from [InstCombine] Avoid DeMorgan's on occasion to [X86] Invert (and X, ~(and ~Y, Z)) back into (and X, (or Y, ~Z)) on Sep 20, 2024
@Saldivarcher
Contributor Author

@topperc and @RKSimon thanks for the help and explanations, they were extremely useful.

@nikic sorry you had to make the changes because of me! Those were good changes tho 👍

@Saldivarcher
Contributor Author

@RKSimon I did pretty much what you suggested and got the results we expect:

$ cat to-optimize.ll 
define dso_local i64 @foo(i64 %0, i64 %1, i64 %2, i64 %3) local_unnamed_addr {
Entry:
  %4 = and i64 %2, %1
  %5 = xor i64 %4, -1
  %6 = and i64 %5, %0
  %.not = xor i64 %3, -1
  %7 = or i64 %.not, %2
  %8 = and i64 %6, %7
  ret i64 %8
}

declare void @llvm.dbg.value(metadata, metadata, metadata) #1

define dso_local <16 x i8> @fooVec(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3) local_unnamed_addr {
Entry:
  %4 = and <16 x i8> %2, %1
  %5 = xor <16 x i8> %4, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %6 = and <16 x i8> %5, %0
  %.not = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %7 = or <16 x i8> %.not, %2
  %8 = and <16 x i8> %6, %7
  ret <16 x i8> %8
}
$ ~/workspace/work/llvm-project.git/build_debug/bin/llc to-optimize.ll -o out.s --x86-asm-syntax=intel
$ cat out.s
        .text
        .intel_syntax noprefix
        .file   "to-optimize.ll"
        .globl  foo                             # -- Begin function foo
        .p2align        4, 0x90
        .type   foo,@function
foo:                                    # @foo
        .cfi_startproc
# %bb.0:                                # %Entry
        mov     rax, rcx
        and     rsi, rdx
        not     rsi
        and     rsi, rdi
        not     rax
        or      rax, rdx
        and     rax, rsi
        ret
.Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
        .cfi_endproc
                                        # -- End function
        .globl  fooVec                          # -- Begin function fooVec
        .p2align        4, 0x90
        .type   fooVec,@function
fooVec:                                 # @fooVec
        .cfi_startproc
# %bb.0:                                # %Entry
        andps   xmm1, xmm2
        andnps  xmm1, xmm0
        andnps  xmm2, xmm3
        andnps  xmm2, xmm1
        movaps  xmm0, xmm2
        ret
.Lfunc_end1:
        .size   fooVec, .Lfunc_end1-fooVec
        .cfi_endproc
                                        # -- End function
        .section        ".note.GNU-stack","",@progbits
$ ~/workspace/work/llvm-project.git/build_debug/bin/llc to-optimize.ll -o out.s --x86-asm-syntax=intel -mcpu=znver3
$ cat out.s
        .text
        .intel_syntax noprefix
        .file   "to-optimize.ll"
        .globl  foo                             # -- Begin function foo
        .p2align        4, 0x90
        .type   foo,@function
foo:                                    # @foo
        .cfi_startproc
# %bb.0:                                # %Entry
        and     rsi, rdx
        andn    rcx, rdx, rcx
        andn    rax, rsi, rdi
        andn    rax, rcx, rax
        ret
.Lfunc_end0:
        .size   foo, .Lfunc_end0-foo
        .cfi_endproc
                                        # -- End function
        .globl  fooVec                          # -- Begin function fooVec
        .p2align        4, 0x90
        .type   fooVec,@function
fooVec:                                 # @fooVec
        .cfi_startproc
# %bb.0:                                # %Entry
        vandps  xmm1, xmm2, xmm1
        vandnps xmm2, xmm2, xmm3
        vandnps xmm0, xmm1, xmm0
        vandnps xmm0, xmm2, xmm0
        ret
.Lfunc_end1:
        .size   fooVec, .Lfunc_end1-fooVec
        .cfi_endproc
                                        # -- End function
        .section        ".note.GNU-stack","",@progbits

@Saldivarcher Saldivarcher marked this pull request as ready for review September 20, 2024 18:49
@llvmbot
Member

llvmbot commented Sep 20, 2024

@llvm/pr-subscribers-backend-x86

Author: Miguel Saldivar (Saldivarcher)

Changes

When `andn` is available, we should avoid switching `s &= ~(z & ~y);` into `s &= ~z | y;`.

This patch turns this assembly from:

foo:
        not     rcx
        and     rsi, rdx
        andn    rax, rsi, rdi
        or      rcx, rdx
        and     rax, rcx
        ret

into:

foo:
        and     rsi, rdx
        andn    rcx, rdx, rcx
        andn    rax, rsi, rdi
        andn    rax, rcx, rax
        ret

Full diff: https://github.com/llvm/llvm-project/pull/109215.diff

3 Files Affected:

  • (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+29)
  • (modified) llvm/test/CodeGen/X86/avx512vl-logic.ll (+1-1)
  • (added) llvm/test/CodeGen/X86/pr108731.ll (+40)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index c5dc3ea17f72c3..ed3b5c551bd69d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -50034,6 +50034,32 @@ static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
          (VT == MVT::i32 || (VT == MVT::i64 && Subtarget.is64Bit()));
 }
 
+/// InstCombine converts:
+///    `(and X, ~(and ~Y, Z))`
+/// to
+///    `(and X, (or Y, ~Z))`
+///
+/// But we should undo this transformation if the `andn` instruction is
+/// available to us.
+static SDValue combineAndNotOrIntoAndNotAnd(SDNode *N, SelectionDAG &DAG,
+                                            const X86Subtarget &Subtarget) {
+
+  using namespace llvm::SDPatternMatch;
+  MVT VT = N->getSimpleValueType(0);
+  SDLoc DL(N);
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (TLI.hasAndNot(SDValue(N, 0))) {
+    SDValue X, Y, Z;
+    if (sd_match(N, m_And(m_Value(X),
+                          m_OneUse(m_Or(m_Value(Y), m_Not(m_Value(Z)))))))
+      return DAG.getNode(
+          ISD::AND, DL, VT, X,
+          DAG.getNOT(
+              DL, DAG.getNode(ISD::AND, DL, VT, DAG.getNOT(DL, Y, VT), Z), VT));
+  }
+  return SDValue();
+}
+
 // This function recognizes cases where X86 bzhi instruction can replace and
 // 'and-load' sequence.
 // In case of loading integer value from an array of constants which is defined
@@ -50531,6 +50557,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
     return R;
 
+  if (SDValue R = combineAndNotOrIntoAndNotAnd(N, DAG, Subtarget))
+    return R;
+
   // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2))
   // iff c2 is all/no bits mask - i.e. a select-with-zero mask.
   // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW?
diff --git a/llvm/test/CodeGen/X86/avx512vl-logic.ll b/llvm/test/CodeGen/X86/avx512vl-logic.ll
index 58621967e2aca6..ea9d785ed88ac2 100644
--- a/llvm/test/CodeGen/X86/avx512vl-logic.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-logic.ll
@@ -980,7 +980,7 @@ define <4 x i32> @ternlog_or_andn(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 define <4 x i32> @ternlog_and_orn(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
 ; CHECK-LABEL: ternlog_and_orn:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    vpternlogd $176, %xmm1, %xmm2, %xmm0
+; CHECK-NEXT:    vpternlogd $208, %xmm2, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
   %a = xor <4 x i32> %z, <i32 -1, i32 -1, i32 -1, i32 -1>
   %b = or <4 x i32> %a, %y
diff --git a/llvm/test/CodeGen/X86/pr108731.ll b/llvm/test/CodeGen/X86/pr108731.ll
new file mode 100644
index 00000000000000..6022b068536935
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr108731.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64-gnu-unknown -mcpu=znver3 | FileCheck %s
+
+define dso_local i64 @foo(i64 %0, i64 %1, i64 %2, i64 %3) local_unnamed_addr {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %Entry
+; CHECK-NEXT:    andq %rdx, %rsi
+; CHECK-NEXT:    andnq %rcx, %rdx, %rcx
+; CHECK-NEXT:    andnq %rdi, %rsi, %rax
+; CHECK-NEXT:    andnq %rax, %rcx, %rax
+; CHECK-NEXT:    retq
+Entry:
+  %4 = and i64 %2, %1
+  %5 = xor i64 %4, -1
+  %6 = and i64 %5, %0
+  %.not = xor i64 %3, -1
+  %7 = or i64 %.not, %2
+  %8 = and i64 %6, %7
+  ret i64 %8
+}
+
+declare void @llvm.dbg.value(metadata, metadata, metadata) #1
+
+define dso_local <16 x i8> @fooVec(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2, <16 x i8> %3) local_unnamed_addr {
+; CHECK-LABEL: fooVec:
+; CHECK:       # %bb.0: # %Entry
+; CHECK-NEXT:    vandps %xmm1, %xmm2, %xmm1
+; CHECK-NEXT:    vandnps %xmm3, %xmm2, %xmm2
+; CHECK-NEXT:    vandnps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vandnps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    retq
+Entry:
+  %4 = and <16 x i8> %2, %1
+  %5 = xor <16 x i8> %4, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %6 = and <16 x i8> %5, %0
+  %.not = xor <16 x i8> %3, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %7 = or <16 x i8> %.not, %2
+  %8 = and <16 x i8> %6, %7
+  ret <16 x i8> %8
+}

@Saldivarcher
Contributor Author

@RKSimon ping!

@RKSimon RKSimon self-requested a review October 4, 2024 12:36
@RKSimon
Collaborator

RKSimon commented Oct 4, 2024

@Saldivarcher sorry this fell off my review list as I wasn't a listed reviewer - will take another look soon

Collaborator

@RKSimon RKSimon left a comment

LGTM - cheers

The reason for this inversion is to utilize the `andn` instruction,
which in turn produces less assembly code.

This is the assembly we produced previously:
```
        not     rcx
        and     rsi, rdx
        andn    rax, rsi, rdi
        or      rcx, rdx
        and     rax, rcx
        ret
```

The assembly with the inversion:

```
        and     rsi, rdx
        andn    rcx, rdx, rcx
        andn    rax, rsi, rdi
        andn    rax, rcx, rax
        ret
```
@Saldivarcher
Contributor Author

Saldivarcher commented Oct 11, 2024

@RKSimon can you rebase and merge the PR for me? I don't have write access. Thank you!

@RKSimon RKSimon merged commit 6fd229a into llvm:main Oct 12, 2024
8 checks passed
DanielCChen pushed a commit to DanielCChen/llvm-project that referenced this pull request Oct 16, 2024
@bgra8
Contributor

bgra8 commented Oct 21, 2024

@Saldivarcher this patch is causing a huge number (hundreds) of breakages in our codebase at Google (the compiler enters an infinite loop and never finishes).

Here's a reproducer:

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
target triple = "x86_64-generic-linux-gnu"

define void @_renamed(i64 %error_code.coerce) #0 {
entry:
  %0 = and i64 %error_code.coerce, 4294967296
  %1 = xor i64 %0, -1
  %2 = and i64 %error_code.coerce, %1
  %3 = icmp eq i64 %2, 0
  br i1 %3, label %5, label %4

4:                                                ; preds = %entry
  unreachable

5:                                                ; preds = %entry
  ret void
}

attributes #0 = { "target-cpu"="haswell" }

This compiles in under a second with the previous version and never seems to finish with this patch applied.

Can you please revert while investigating?

Saldivarcher added a commit to Saldivarcher/llvm-project that referenced this pull request Oct 21, 2024
@Saldivarcher
Contributor Author

@bgra8 actually it looks like this commit fixes the issue:

commit ec78f0da0e9b1b8e2b2323e434ea742e272dd913
Author: Simon Pilgrim <[email protected]>
Date:   Tue Oct 15 16:54:08 2024 +0100

    [X86] combineAndNotOrIntoAndNotAnd - don't attempt with constant operands
    
    Don't fold AND(X,OR(NOT(Z),C)) -> AND(X,NOT(AND(Z,C'))) as DAGCombiner will invert it back again.
    
    Fixes #112347 

Can you give it a try?

@dyung
Collaborator

dyung commented Oct 22, 2024

@Saldivarcher we are also seeing compiler timeouts on an internal test, which I bisected back to your commit and which ec78f0d does not fix. I have filed the details in #113240; can you take a look?

@Saldivarcher
Contributor Author

#113264 also fixes some infinite loops.

@bgra8
Contributor

bgra8 commented Oct 22, 2024

Commit ec78f0d does not help for most of the failing targets on our side.

But #113264 fixes all targets we've identified so far.
