[SimplifyCFG] Emit `rotl` directly in `ReduceSwitchRange` #77603

dtcxzyw · 2024-01-10T13:25:28Z

This patch emits ROTL(Cond, BitWidth - Shift) directly in ReduceSwitchRange. This should give better codegen because SimplifyDemandedBits will break the rotation patterns in the original form.

See also #73441 and the IR diff https://github.com/dtcxzyw/llvm-opt-benchmark/pull/115/files.
This patch should cover most of the cases handled by #73441.

llvmbot · 2024-01-10T13:25:58Z

@llvm/pr-subscribers-llvm-transforms

Author: Yingwei Zheng (dtcxzyw)

Changes

This patch emits ROTL(Cond, BitWidth - Shift) directly in ReduceSwitchRange. This should give better codegen because SimplifyDemandedBits will break the rotation patterns in the original form.

See also #73441 and the IR diff https://github.com/dtcxzyw/llvm-opt-benchmark/pull/115/files.
This patch should cover most of the cases handled by #73441.

Full diff: https://github.com/llvm/llvm-project/pull/77603.diff

2 Files Affected:

(modified) llvm/lib/Transforms/Utils/SimplifyCFG.cpp (+6-7)
(modified) llvm/test/Transforms/SimplifyCFG/rangereduce.ll (+20-30)

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 61d891d65346bd..7515e539e7fb78 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -6919,18 +6919,17 @@ static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
 
   auto *Ty = cast<IntegerType>(SI->getCondition()->getType());
   Builder.SetInsertPoint(SI);
-  auto *ShiftC = ConstantInt::get(Ty, Shift);
-  auto *Sub = Builder.CreateSub(SI->getCondition(), ConstantInt::get(Ty, Base));
-  auto *LShr = Builder.CreateLShr(Sub, ShiftC);
-  auto *Shl = Builder.CreateShl(Sub, Ty->getBitWidth() - Shift);
-  auto *Rot = Builder.CreateOr(LShr, Shl);
+  Value *Sub =
+      Builder.CreateSub(SI->getCondition(), ConstantInt::get(Ty, Base));
+  Value *Rot = Builder.CreateIntrinsic(
+      Ty, Intrinsic::fshl,
+      {Sub, Sub, ConstantInt::get(Ty, Ty->getBitWidth() - Shift)});
   SI->replaceUsesOfWith(SI->getCondition(), Rot);
 
   for (auto Case : SI->cases()) {
     auto *Orig = Case.getCaseValue();
     auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base);
-    Case.setValue(
-        cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(ShiftC->getValue()))));
+    Case.setValue(cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(Shift))));
   }
   return true;
 }
diff --git a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll
index b1a3802a2bb58b..d47bf5f9541881 100644
--- a/llvm/test/Transforms/SimplifyCFG/rangereduce.ll
+++ b/llvm/test/Transforms/SimplifyCFG/rangereduce.ll
@@ -7,13 +7,11 @@ target datalayout = "e-n32"
 define i32 @test1(i32 %a) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[A:%.*]], 97
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP1]], 30
-; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 4
-; CHECK-NEXT:    br i1 [[TMP5]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.fshl.i32(i32 [[TMP1]], i32 [[TMP1]], i32 30)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 4
+; CHECK-NEXT:    br i1 [[TMP3]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
 ; CHECK:       switch.lookup:
-; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [4 x i32], ptr @switch.table.test1, i32 0, i32 [[TMP4]]
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [4 x i32], ptr @switch.table.test1, i32 0, i32 [[TMP2]]
 ; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
 ; CHECK-NEXT:    br label [[COMMON_RET]]
 ; CHECK:       common.ret:
@@ -183,13 +181,11 @@ three:
 define i32 @test6(i32 %a) optsize {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[A:%.*]], -109
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP1]], 30
-; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 4
-; CHECK-NEXT:    br i1 [[TMP5]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.fshl.i32(i32 [[TMP1]], i32 [[TMP1]], i32 30)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 4
+; CHECK-NEXT:    br i1 [[TMP3]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
 ; CHECK:       switch.lookup:
-; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [4 x i32], ptr @switch.table.test6, i32 0, i32 [[TMP4]]
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [4 x i32], ptr @switch.table.test6, i32 0, i32 [[TMP2]]
 ; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
 ; CHECK-NEXT:    br label [[COMMON_RET]]
 ; CHECK:       common.ret:
@@ -218,15 +214,13 @@ define i8 @test7(i8 %a) optsize {
 ; CHECK-LABEL: @test7(
 ; CHECK-NEXT:  common.ret:
 ; CHECK-NEXT:    [[TMP0:%.*]] = sub i8 [[A:%.*]], -36
-; CHECK-NEXT:    [[TMP1:%.*]] = lshr i8 [[TMP0]], 2
-; CHECK-NEXT:    [[TMP2:%.*]] = shl i8 [[TMP0]], 6
-; CHECK-NEXT:    [[TMP3:%.*]] = or i8 [[TMP1]], [[TMP2]]
-; CHECK-NEXT:    [[TMP4:%.*]] = icmp ult i8 [[TMP3]], 4
-; CHECK-NEXT:    [[SWITCH_CAST:%.*]] = zext i8 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.fshl.i8(i8 [[TMP0]], i8 [[TMP0]], i8 6)
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp ult i8 [[TMP1]], 4
+; CHECK-NEXT:    [[SWITCH_CAST:%.*]] = zext i8 [[TMP1]] to i32
 ; CHECK-NEXT:    [[SWITCH_SHIFTAMT:%.*]] = mul nuw nsw i32 [[SWITCH_CAST]], 8
 ; CHECK-NEXT:    [[SWITCH_DOWNSHIFT:%.*]] = lshr i32 -943228976, [[SWITCH_SHIFTAMT]]
 ; CHECK-NEXT:    [[SWITCH_MASKED:%.*]] = trunc i32 [[SWITCH_DOWNSHIFT]] to i8
-; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = select i1 [[TMP4]], i8 [[SWITCH_MASKED]], i8 -93
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = select i1 [[TMP2]], i8 [[SWITCH_MASKED]], i8 -93
 ; CHECK-NEXT:    ret i8 [[COMMON_RET_OP]]
 ;
   switch i8 %a, label %def [
@@ -250,13 +244,11 @@ three:
 define i32 @test8(i32 %a) optsize {
 ; CHECK-LABEL: @test8(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[A:%.*]], 97
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP1]], 30
-; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 5
-; CHECK-NEXT:    br i1 [[TMP5]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.fshl.i32(i32 [[TMP1]], i32 [[TMP1]], i32 30)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 5
+; CHECK-NEXT:    br i1 [[TMP3]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
 ; CHECK:       switch.lookup:
-; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [5 x i32], ptr @switch.table.test8, i32 0, i32 [[TMP4]]
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [5 x i32], ptr @switch.table.test8, i32 0, i32 [[TMP2]]
 ; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
 ; CHECK-NEXT:    br label [[COMMON_RET]]
 ; CHECK:       common.ret:
@@ -284,13 +276,11 @@ three:
 define i32 @test9(i32 %a) {
 ; CHECK-LABEL: @test9(
 ; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[A:%.*]], 6
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 1
-; CHECK-NEXT:    [[TMP3:%.*]] = shl i32 [[TMP1]], 31
-; CHECK-NEXT:    [[TMP4:%.*]] = or i32 [[TMP2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult i32 [[TMP4]], 8
-; CHECK-NEXT:    br i1 [[TMP5]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.fshl.i32(i32 [[TMP1]], i32 [[TMP1]], i32 31)
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult i32 [[TMP2]], 8
+; CHECK-NEXT:    br i1 [[TMP3]], label [[SWITCH_LOOKUP:%.*]], label [[COMMON_RET:%.*]]
 ; CHECK:       switch.lookup:
-; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [8 x i32], ptr @switch.table.test9, i32 0, i32 [[TMP4]]
+; CHECK-NEXT:    [[SWITCH_GEP:%.*]] = getelementptr inbounds [8 x i32], ptr @switch.table.test9, i32 0, i32 [[TMP2]]
 ; CHECK-NEXT:    [[SWITCH_LOAD:%.*]] = load i32, ptr [[SWITCH_GEP]], align 4
 ; CHECK-NEXT:    br label [[COMMON_RET]]
 ; CHECK:       common.ret:

PR Link: llvm/llvm-project#77603

dianqk · 2024-01-10T13:30:26Z

LGTM, provided that fshl is an equivalent or cheaper alternative.

dtcxzyw · 2024-01-10T13:33:56Z

> LGTM. If fshl is an equivalent alternative or a cheaper alternative.

We always transform this pattern into fshl in InstCombine. So at least it doesn't cause regressions.

nikic

LGTM

This patch emits `ROTL(Cond, BitWidth - Shift)` directly in `ReduceSwitchRange`. This should give better codegen because `SimplifyDemandedBits` will break the rotation patterns in the original form. See also llvm#73441 and the IR diff https://github.com/dtcxzyw/llvm-opt-benchmark/pull/115/files. This patch should cover most of the cases handled by llvm#73441.

[SimplifyCFG] Emit rotl directly in ReduceSwitchRange

0e1c269

dtcxzyw requested review from nikic and dianqk January 10, 2024 13:25

llvmbot added the llvm:transforms label Jan 10, 2024

dtcxzyw added a commit to dtcxzyw/llvm-opt-benchmark that referenced this pull request Jan 10, 2024

pre-commit: test PR77603

2c2c59e

PR Link: llvm/llvm-project#77603

dtcxzyw mentioned this pull request Jan 10, 2024

pre-commit: test PR77603 dtcxzyw/llvm-opt-benchmark#119

Closed

nikic approved these changes Jan 10, 2024

View reviewed changes

dtcxzyw merged commit 45be680 into llvm:main Jan 10, 2024

dtcxzyw deleted the simplifycfg-emit-fshl branch January 10, 2024 14:57

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[SimplifyCFG] Emit `rotl` directly in `ReduceSwitchRange` #77603

[SimplifyCFG] Emit `rotl` directly in `ReduceSwitchRange` #77603

Uh oh!

dtcxzyw commented Jan 10, 2024

Uh oh!

llvmbot commented Jan 10, 2024

Uh oh!

dianqk commented Jan 10, 2024

Uh oh!

dtcxzyw commented Jan 10, 2024

Uh oh!

nikic left a comment

Uh oh!

Uh oh!

[SimplifyCFG] Emit rotl directly in ReduceSwitchRange #77603

[SimplifyCFG] Emit rotl directly in ReduceSwitchRange #77603

Uh oh!

Conversation

dtcxzyw commented Jan 10, 2024

Uh oh!

llvmbot commented Jan 10, 2024

Uh oh!

dianqk commented Jan 10, 2024

Uh oh!

dtcxzyw commented Jan 10, 2024

Uh oh!

nikic left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

[SimplifyCFG] Emit `rotl` directly in `ReduceSwitchRange` #77603

[SimplifyCFG] Emit `rotl` directly in `ReduceSwitchRange` #77603