Skip to content

[SimplifyCFG] Emit rotl directly in ReduceSwitchRange #77603

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jan 10, 2024

Conversation

dtcxzyw
Copy link
Member

@dtcxzyw dtcxzyw commented Jan 10, 2024

This patch emits ROTL(Cond, BitWidth - Shift) directly in ReduceSwitchRange. This should give better codegen because SimplifyDemandedBits will break the rotation patterns in the original form.

See also #73441 and the IR diff https://github.com/dtcxzyw/llvm-opt-benchmark/pull/115/files.
This patch should cover most of cases handled by #73441.

@llvmbot
Copy link
Member

llvmbot commented Jan 10, 2024

@llvm/pr-subscribers-llvm-transforms

Author: Yingwei Zheng (dtcxzyw)

Changes

This patch emits ROTL(Cond, BitWidth - Shift) directly in ReduceSwitchRange. This should give better codegen because SimplifyDemandedBits will break the rotation patterns in the original form.

See also #73441 and the IR diff https://github.com/dtcxzyw/llvm-opt-benchmark/pull/115/files.
This patch should cover most of cases handled by #73441.


Full diff: https://github.com/llvm/llvm-project/pull/77603.diff

2 Files Affected:

  • (modified) llvm/lib/Transforms/Utils/SimplifyCFG.cpp (+6-7)
  • (modified) llvm/test/Transforms/SimplifyCFG/rangereduce.ll (+20-30)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 61d891d65346bd..7515e539e7fb78 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -6919,18 +6919,17 @@ static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
 
   auto *Ty = cast<IntegerType>(SI->getCondition()->getType());
   Builder.SetInsertPoint(SI);
-  auto *ShiftC = ConstantInt::get(Ty, Shift);
-  auto *Sub = Builder.CreateSub(SI->getCondition(), ConstantInt::get(Ty, Base));
-  auto *LShr = Builder.CreateLShr(Sub, ShiftC);
-  auto *Shl = Builder.CreateShl(Sub, Ty->getBitWidth() - Shift);
-  auto *Rot = Builder.CreateOr(LShr, Shl);
+  Value *Sub =
+      Builder.CreateSub(SI->getCondition(), ConstantInt::get(Ty, Base));
+  Value *Rot = Builder.CreateIntrinsic(
+      Ty, Intrinsic::fshl,
+      {Sub, Sub, ConstantInt::get(Ty, Ty->getBitWidth() - Shift)});
   SI->replaceUsesOfWith(SI->getCondition(), Rot);
 
   for (auto Case : SI->cases()) {
     auto *Orig = Case.getCaseValue();
     auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base);
-    Case.setValue(
-        cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(ShiftC->getValue()))));
+    Case.setValue(cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(Shift))));
   }
   return true;
 }
diff --git a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll
index b1a3802a2bb58b..d47bf5f9541881 100644
--- a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll
+++ b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll
@@ -7,13 +7,11 @@ target datalayout = "e-n32"
 define i32 @test1(i32 %a) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[A:%.*]], 97
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP1]], 30
-; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 4
-; CHECK-NEXT:    br i1 [[TMP5]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.fshl.i32(i32 [[TMP1]], i32 [[TMP1]], i32 30)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 4
+; CHECK-NEXT:    br i1 [[TMP3]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
 ; CHECK:       switch.lookup:
-; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [4 x i32], ptr @switch.table.test1, i32 0, i32 [[TMP4]]
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [4 x i32], ptr @switch.table.test1, i32 0, i32 [[TMP2]]
 ; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
 ; CHECK-NEXT:    br label [[COMMON_RET]]
 ; CHECK:       common.ret:
@@ -183,13 +181,11 @@ three:
 define i32 @test6(i32 %a) optsize {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[A:%.*]], -109
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP1]], 30
-; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 4
-; CHECK-NEXT:    br i1 [[TMP5]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.fshl.i32(i32 [[TMP1]], i32 [[TMP1]], i32 30)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 4
+; CHECK-NEXT:    br i1 [[TMP3]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
 ; CHECK:       switch.lookup:
-; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [4 x i32], ptr @switch.table.test6, i32 0, i32 [[TMP4]]
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [4 x i32], ptr @switch.table.test6, i32 0, i32 [[TMP2]]
 ; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
 ; CHECK-NEXT:    br label [[COMMON_RET]]
 ; CHECK:       common.ret:
@@ -218,15 +214,13 @@ define i8 @test7(i8 %a) optsize {
 ; CHECK-LABEL: @test7(
 ; CHECK-NEXT:  common.ret:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i8 [[A:%.*]], -36
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i8 [[TMP0]], 6
-; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i8 [[TMP3]], 4
-; CHECK-NEXT:    [[SWITCH_CAST:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.fshl.i8(i8 [[TMP0]], i8 [[TMP0]], i8 6)
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i8 [[TMP1]], 4
+; CHECK-NEXT:    [[SWITCH_CAST:%.*]] = zext i8 [[TMP1]] to i32
 ; CHECK-NEXT:    [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i32 [[SWITCH_CAST]], 8
 ; CHECK-NEXT:    [[SWITCH_DOWNSHIFT:%.*]] = lshr i32 -943228976, [[SWITCH_SHIFTAMT]]
 ; CHECK-NEXT:    [[SWITCH_MASKED:%.*]] = trunc i32 [[SWITCH_DOWNSHIFT]] to i8
-; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = select i1 [[TMP4]], i8 [[SWITCH_MASKED]], i8 -93
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = select i1 [[TMP2]], i8 [[SWITCH_MASKED]], i8 -93
 ; CHECK-NEXT:    ret i8 [[COMMON_RET_OP]]
 ;
   switch i8 %a, label %def [
@@ -250,13 +244,11 @@ three:
 define i32 @test8(i32 %a) optsize {
 ; CHECK-LABEL: @test8(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[A:%.*]], 97
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP1]], 30
-; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 5
-; CHECK-NEXT:    br i1 [[TMP5]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.fshl.i32(i32 [[TMP1]], i32 [[TMP1]], i32 30)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 5
+; CHECK-NEXT:    br i1 [[TMP3]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
 ; CHECK:       switch.lookup:
-; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [5 x i32], ptr @switch.table.test8, i32 0, i32 [[TMP4]]
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [5 x i32], ptr @switch.table.test8, i32 0, i32 [[TMP2]]
 ; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
 ; CHECK-NEXT:    br label [[COMMON_RET]]
 ; CHECK:       common.ret:
@@ -284,13 +276,11 @@ three:
 define i32 @test9(i32 %a) {
 ; CHECK-LABEL: @test9(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[A:%.*]], 6
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP1]], 31
-; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8
-; CHECK-NEXT:    br i1 [[TMP5]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.fshl.i32(i32 [[TMP1]], i32 [[TMP1]], i32 31)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 8
+; CHECK-NEXT:    br i1 [[TMP3]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
 ; CHECK:       switch.lookup:
-; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [8 x i32], ptr @switch.table.test9, i32 0, i32 [[TMP4]]
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [8 x i32], ptr @switch.table.test9, i32 0, i32 [[TMP2]]
 ; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
 ; CHECK-NEXT:    br label [[COMMON_RET]]
 ; CHECK:       common.ret:

dtcxzyw added a commit to dtcxzyw/llvm-opt-benchmark that referenced this pull request Jan 10, 2024
@dianqk
Copy link
Member

dianqk commented Jan 10, 2024

LGTM. If fshl is an equivalent alternative or a cheaper alternative.

@dtcxzyw
Copy link
Member Author

dtcxzyw commented Jan 10, 2024

LGTM. If fshl is an equivalent alternative or a cheaper alternative.

We always transform this pattern into fshl in InstCombine. So at least it doesn't cause regressions.

Copy link
Contributor

@nikic nikic left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM

@dtcxzyw dtcxzyw merged commit 45be680 into llvm:main Jan 10, 2024
@dtcxzyw dtcxzyw deleted the simplifycfg-emit-fshl branch January 10, 2024 14:57
justinfargnoli pushed a commit to justinfargnoli/llvm-project that referenced this pull request Jan 28, 2024
This patch emits `ROTL(Cond, BitWidth - Shift)` directly in
`ReduceSwitchRange`. This should give better codegen because
`SimplifyDemandedBits` will break the rotation patterns in the original
form.

See also llvm#73441 and the IR diff
https://github.com/dtcxzyw/llvm-opt-benchmark/pull/115/files.
This patch should cover most of cases handled by llvm#73441.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

4 participants