[InstCombine] Fold bitwise logic with intrinsics #77460
Conversation
Force-pushed from 571ed36 to c21135e
@llvm/pr-subscribers-llvm-transforms

Author: Yingwei Zheng (dtcxzyw)

Changes

Alive2: https://alive2.llvm.org/ce/z/S28Y3G

Full diff: https://github.com/llvm/llvm-project/pull/77460.diff

2 Files Affected:
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index c03f50d75814d8..6df899e2069089 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -2159,6 +2159,35 @@ Instruction *InstCombinerImpl::foldBinOpOfDisplacedShifts(BinaryOperator &I) {
   return BinaryOperator::Create(ShiftOp, NewC, ShAmt);
 }
+static Instruction *
+foldBitwiseLogicWithFunnelShift(BinaryOperator &I,
+                                InstCombiner::BuilderTy &Builder) {
+  assert(I.isBitwiseLogicOp() && "Should and/or/xor");
+  Value *X = I.getOperand(0);
+  Value *Y = I.getOperand(1);
+  Value *Op0, *Op1, *Op2, *Op3, *ShAmt;
+  if (match(X, m_OneUse(m_FShl(m_Value(Op0), m_Value(Op1), m_Value(ShAmt)))) &&
+      match(Y,
+            m_OneUse(m_FShl(m_Value(Op2), m_Value(Op3), m_Specific(ShAmt))))) {
+    Value *NewOp0 = Builder.CreateBinOp(I.getOpcode(), Op0, Op2);
+    Value *NewOp1 = Builder.CreateBinOp(I.getOpcode(), Op1, Op3);
+    Function *F =
+        Intrinsic::getDeclaration(I.getModule(), Intrinsic::fshl, I.getType());
+    return CallInst::Create(F, {NewOp0, NewOp1, ShAmt});
+  }
+  if (match(X, m_OneUse(m_FShr(m_Value(Op0), m_Value(Op1), m_Value(ShAmt)))) &&
+      match(Y,
+            m_OneUse(m_FShr(m_Value(Op2), m_Value(Op3), m_Specific(ShAmt))))) {
+    Value *NewOp0 = Builder.CreateBinOp(I.getOpcode(), Op0, Op2);
+    Value *NewOp1 = Builder.CreateBinOp(I.getOpcode(), Op1, Op3);
+    Function *F =
+        Intrinsic::getDeclaration(I.getModule(), Intrinsic::fshr, I.getType());
+    return CallInst::Create(F, {NewOp0, NewOp1, ShAmt});
+  }
+
+  return nullptr;
+}
+
 // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
 // here. We should standardize that construct where it is needed or choose some
 // other way to ensure that commutated variants of patterns are not missed.
@@ -2688,6 +2717,9 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
   if (Instruction *Res = foldBinOpOfDisplacedShifts(I))
     return Res;
+  if (Instruction *Res = foldBitwiseLogicWithFunnelShift(I, Builder))
+    return Res;
+
   return nullptr;
 }
@@ -3884,6 +3916,9 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
     return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, *C1 | *C2));
   }
+  if (Instruction *Res = foldBitwiseLogicWithFunnelShift(I, Builder))
+    return Res;
+
   return nullptr;
 }
@@ -4799,5 +4834,8 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
   if (Instruction *Res = foldBinOpOfDisplacedShifts(I))
     return Res;
+  if (Instruction *Res = foldBitwiseLogicWithFunnelShift(I, Builder))
+    return Res;
+
   return nullptr;
 }
diff --git a/llvm/test/Transforms/InstCombine/bitwiselogic-funnelshift.ll b/llvm/test/Transforms/InstCombine/bitwiselogic-funnelshift.ll
new file mode 100644
index 00000000000000..31d82a53b38009
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/bitwiselogic-funnelshift.ll
@@ -0,0 +1,133 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+define i32 @test_or_fshl(i32 %a, i32 %b, i32 %c, i32 %d, i32 %sh) {
+; CHECK-LABEL: define i32 @test_or_fshl(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]], i32 [[SH:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[A]], [[C]]
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[B]], [[D]]
+; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.fshl.i32(i32 [[TMP1]], i32 [[TMP2]], i32 [[SH]])
+; CHECK-NEXT: ret i32 [[RET]]
+;
+ %xor1 = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %sh)
+ %xor2 = call i32 @llvm.fshl.i32(i32 %c, i32 %d, i32 %sh)
+ %ret = or i32 %xor1, %xor2
+ ret i32 %ret
+}
+define i32 @test_and_fshl(i32 %a, i32 %b, i32 %c, i32 %d, i32 %sh) {
+; CHECK-LABEL: define i32 @test_and_fshl(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]], i32 [[SH:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[A]], [[C]]
+; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[B]], [[D]]
+; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.fshl.i32(i32 [[TMP1]], i32 [[TMP2]], i32 [[SH]])
+; CHECK-NEXT: ret i32 [[RET]]
+;
+ %xor1 = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %sh)
+ %xor2 = call i32 @llvm.fshl.i32(i32 %c, i32 %d, i32 %sh)
+ %ret = and i32 %xor1, %xor2
+ ret i32 %ret
+}
+define i32 @test_xor_fshl(i32 %a, i32 %b, i32 %c, i32 %d, i32 %sh) {
+; CHECK-LABEL: define i32 @test_xor_fshl(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]], i32 [[SH:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = xor i32 [[A]], [[C]]
+; CHECK-NEXT: [[TMP2:%.*]] = xor i32 [[B]], [[D]]
+; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.fshl.i32(i32 [[TMP1]], i32 [[TMP2]], i32 [[SH]])
+; CHECK-NEXT: ret i32 [[RET]]
+;
+ %xor1 = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %sh)
+ %xor2 = call i32 @llvm.fshl.i32(i32 %c, i32 %d, i32 %sh)
+ %ret = xor i32 %xor1, %xor2
+ ret i32 %ret
+}
+define i32 @test_or_fshr(i32 %a, i32 %b, i32 %c, i32 %d, i32 %sh) {
+; CHECK-LABEL: define i32 @test_or_fshr(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]], i32 [[SH:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[A]], [[C]]
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[B]], [[D]]
+; CHECK-NEXT: [[RET:%.*]] = call i32 @llvm.fshr.i32(i32 [[TMP1]], i32 [[TMP2]], i32 [[SH]])
+; CHECK-NEXT: ret i32 [[RET]]
+;
+ %xor1 = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %sh)
+ %xor2 = call i32 @llvm.fshr.i32(i32 %c, i32 %d, i32 %sh)
+ %ret = or i32 %xor1, %xor2
+ ret i32 %ret
+}
+define i32 @test_or_fshl_cascade(i32 %a, i32 %b, i32 %c) {
+; CHECK-LABEL: define i32 @test_or_fshl_cascade(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]]) {
+; CHECK-NEXT: [[TMP1:%.*]] = or i32 [[A]], [[B]]
+; CHECK-NEXT: [[TMP2:%.*]] = or i32 [[A]], [[B]]
+; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP1]], [[C]]
+; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP2]], [[C]]
+; CHECK-NEXT: [[OR2:%.*]] = call i32 @llvm.fshl.i32(i32 [[TMP3]], i32 [[TMP4]], i32 24)
+; CHECK-NEXT: ret i32 [[OR2]]
+;
+ %fshl1 = call i32 @llvm.fshl.i32(i32 %a, i32 %a, i32 24)
+ %fshl2 = call i32 @llvm.fshl.i32(i32 %b, i32 %b, i32 24)
+ %fshl3 = call i32 @llvm.fshl.i32(i32 %c, i32 %c, i32 24)
+ %or1 = or i32 %fshl1, %fshl2
+ %or2 = or i32 %or1, %fshl3
+ ret i32 %or2
+}
+
+; Negative tests
+
+define i32 @test_or_fshl_fshr(i32 %a, i32 %b, i32 %c, i32 %d, i32 %sh) {
+; CHECK-LABEL: define i32 @test_or_fshl_fshr(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]], i32 [[SH:%.*]]) {
+; CHECK-NEXT: [[XOR1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A]], i32 [[B]], i32 [[SH]])
+; CHECK-NEXT: [[XOR2:%.*]] = call i32 @llvm.fshr.i32(i32 [[C]], i32 [[D]], i32 [[SH]])
+; CHECK-NEXT: [[RET:%.*]] = or i32 [[XOR1]], [[XOR2]]
+; CHECK-NEXT: ret i32 [[RET]]
+;
+ %xor1 = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %sh)
+ %xor2 = call i32 @llvm.fshr.i32(i32 %c, i32 %d, i32 %sh)
+ %ret = or i32 %xor1, %xor2
+ ret i32 %ret
+}
+define i32 @test_or_fshl_mismatched_shamt(i32 %a, i32 %b, i32 %c, i32 %d, i32 %sh1, i32 %sh2) {
+; CHECK-LABEL: define i32 @test_or_fshl_mismatched_shamt(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]], i32 [[SH1:%.*]], i32 [[SH2:%.*]]) {
+; CHECK-NEXT: [[XOR1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A]], i32 [[B]], i32 [[SH1]])
+; CHECK-NEXT: [[XOR2:%.*]] = call i32 @llvm.fshl.i32(i32 [[C]], i32 [[D]], i32 [[SH2]])
+; CHECK-NEXT: [[RET:%.*]] = or i32 [[XOR1]], [[XOR2]]
+; CHECK-NEXT: ret i32 [[RET]]
+;
+ %xor1 = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %sh1)
+ %xor2 = call i32 @llvm.fshl.i32(i32 %c, i32 %d, i32 %sh2)
+ %ret = or i32 %xor1, %xor2
+ ret i32 %ret
+}
+define i32 @test_add_fshl(i32 %a, i32 %b, i32 %c, i32 %d, i32 %sh) {
+; CHECK-LABEL: define i32 @test_add_fshl(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]], i32 [[SH:%.*]]) {
+; CHECK-NEXT: [[XOR1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A]], i32 [[B]], i32 [[SH]])
+; CHECK-NEXT: [[XOR2:%.*]] = call i32 @llvm.fshl.i32(i32 [[C]], i32 [[D]], i32 [[SH]])
+; CHECK-NEXT: [[RET:%.*]] = add i32 [[XOR1]], [[XOR2]]
+; CHECK-NEXT: ret i32 [[RET]]
+;
+ %xor1 = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %sh)
+ %xor2 = call i32 @llvm.fshl.i32(i32 %c, i32 %d, i32 %sh)
+ %ret = add i32 %xor1, %xor2
+ ret i32 %ret
+}
+define i32 @test_or_fshl_multiuse(i32 %a, i32 %b, i32 %c, i32 %d, i32 %sh) {
+; CHECK-LABEL: define i32 @test_or_fshl_multiuse(
+; CHECK-SAME: i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C:%.*]], i32 [[D:%.*]], i32 [[SH:%.*]]) {
+; CHECK-NEXT: [[XOR1:%.*]] = call i32 @llvm.fshl.i32(i32 [[A]], i32 [[B]], i32 [[SH]])
+; CHECK-NEXT: call void @use(i32 [[XOR1]])
+; CHECK-NEXT: [[XOR2:%.*]] = call i32 @llvm.fshl.i32(i32 [[C]], i32 [[D]], i32 [[SH]])
+; CHECK-NEXT: [[RET:%.*]] = or i32 [[XOR1]], [[XOR2]]
+; CHECK-NEXT: ret i32 [[RET]]
+;
+ %xor1 = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %sh)
+ call void @use(i32 %xor1)
+ %xor2 = call i32 @llvm.fshl.i32(i32 %c, i32 %d, i32 %sh)
+ %ret = or i32 %xor1, %xor2
+ ret i32 %ret
+}
+
+declare void @use(i32)
+declare i32 @llvm.fshl.i32(i32, i32, i32)
+declare i32 @llvm.fshr.i32(i32, i32, i32)
This also applies to other bitwise intrinsics, e.g. bitreverse (https://alive2.llvm.org/ce/z/Gut-vS) and bswap. I think it would be preferable to directly structure this in terms of "bitwise op with two equal intrinsic IDs".
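To make the suggestion concrete, here is a minimal sketch of that restructuring, assuming a hypothetical helper name and the usual InstCombine pattern-match utilities (the committed helper may differ in details): both operands must be one-use calls to the same intrinsic, and a switch on the shared intrinsic ID decides how their operands are recombined.

```
// Hypothetical sketch, not the committed code: fold
//   bitwise(intrin(args...), intrin(args'...))
// where both sides are one-use calls to the same intrinsic ID.
static Instruction *
foldBitwiseLogicWithIntrinsics(BinaryOperator &I,
                               InstCombiner::BuilderTy &Builder) {
  assert(I.isBitwiseLogicOp() && "Should be and/or/xor");
  auto *X = dyn_cast<IntrinsicInst>(I.getOperand(0));
  auto *Y = dyn_cast<IntrinsicInst>(I.getOperand(1));
  if (!X || !Y || !X->hasOneUse() || !Y->hasOneUse() ||
      X->getIntrinsicID() != Y->getIntrinsicID())
    return nullptr;

  Intrinsic::ID IID = X->getIntrinsicID();
  switch (IID) {
  case Intrinsic::fshl:
  case Intrinsic::fshr: {
    // Funnel shifts: the shift amounts must match; apply the bitwise op to
    // the corresponding data operands.
    if (X->getArgOperand(2) != Y->getArgOperand(2))
      return nullptr;
    Value *NewOp0 = Builder.CreateBinOp(I.getOpcode(), X->getArgOperand(0),
                                        Y->getArgOperand(0));
    Value *NewOp1 = Builder.CreateBinOp(I.getOpcode(), X->getArgOperand(1),
                                        Y->getArgOperand(1));
    Function *F = Intrinsic::getDeclaration(I.getModule(), IID, I.getType());
    return CallInst::Create(F, {NewOp0, NewOp1, X->getArgOperand(2)});
  }
  case Intrinsic::bswap:
  case Intrinsic::bitreverse: {
    // Byte/bit permutations commute with bitwise logic, so the op can be
    // applied once on the un-permuted operands.
    Value *NewOp = Builder.CreateBinOp(I.getOpcode(), X->getArgOperand(0),
                                       Y->getArgOperand(0));
    Function *F = Intrinsic::getDeclaration(I.getModule(), IID, I.getType());
    return CallInst::Create(F, {NewOp});
  }
  default:
    return nullptr;
  }
}
```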
Force-pushed from c21135e to 2c812e8
Done.
LGTM
This patch does the following folds:

```
bitwise(fshl(A, B, ShAmt), fshl(C, D, ShAmt)) -> fshl(bitwise(A, C), bitwise(B, D), ShAmt)
bitwise(fshr(A, B, ShAmt), fshr(C, D, ShAmt)) -> fshr(bitwise(A, C), bitwise(B, D), ShAmt)
bitwise(bswap(A), bswap(B)) -> bswap(bitwise(A, B))
bitwise(bswap(A), C) -> bswap(bitwise(A, bswap(C)))
bitwise(bitreverse(A), bitreverse(B)) -> bitreverse(bitwise(A, B))
bitwise(bitreverse(A), C) -> bitreverse(bitwise(A, bitreverse(C)))
```

Alive2: https://alive2.llvm.org/ce/z/iZN_TL
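The `bitwise(bswap(A), C)` and `bitwise(bitreverse(A), C)` forms above only pay off when `C` is a constant, since the inner `bswap(C)`/`bitreverse(C)` then constant-folds and a single intrinsic call remains. A minimal sketch of the bswap half of that case, again with a hypothetical helper name (the committed code may handle this inside the same helper):

```
// Hypothetical sketch of the constant-operand variant:
//   bitwise(bswap(A), C) -> bswap(bitwise(A, bswap(C)))
// Profitable because bswap(C) simplifies to a constant, leaving one bswap.
static Instruction *
foldBitwiseLogicWithSwappedConstant(BinaryOperator &I,
                                    InstCombiner::BuilderTy &Builder) {
  assert(I.isBitwiseLogicOp() && "Should be and/or/xor");
  Value *A;
  Constant *C;
  // Commutative match: one-use bswap on one side, a constant on the other.
  if (!match(&I, m_c_BinOp(m_OneUse(m_BSwap(m_Value(A))), m_Constant(C))))
    return nullptr;
  // bswap of a constant constant-folds during simplification.
  Value *SwappedC = Builder.CreateUnaryIntrinsic(Intrinsic::bswap, C);
  Value *NewOp = Builder.CreateBinOp(I.getOpcode(), A, SwappedC);
  Function *F =
      Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap, I.getType());
  return CallInst::Create(F, {NewOp});
}
```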