Skip to content

Commit 9ff9c1d

Browse files
committed
[InstCombine] matchRotate - support (uniform) constant rotation amounts (PR46895)
This patch adds handling of rotation patterns with constant shift amounts. The next step will be supporting non-uniform constant vectors. Differential Revision: https://reviews.llvm.org/D87452
1 parent 994ef4e commit 9ff9c1d

File tree

5 files changed

+38
-52
lines changed

5 files changed

+38
-52
lines changed

llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2087,8 +2087,6 @@ static Instruction *matchRotate(Instruction &Or) {
20872087
// TODO: Can we reduce the code duplication between this and the related
20882088
// rotate matching code under visitSelect and visitTrunc?
20892089
unsigned Width = Or.getType()->getScalarSizeInBits();
2090-
if (!isPowerOf2_32(Width))
2091-
return nullptr;
20922090

20932091
// First, find an or'd pair of opposite shifts with the same shifted operand:
20942092
// or (lshr ShVal, ShAmt0), (shl ShVal, ShAmt1)
@@ -2110,6 +2108,18 @@ static Instruction *matchRotate(Instruction &Or) {
21102108
// Match the shift amount operands for a rotate pattern. This always matches
21112109
// a subtraction on the R operand.
21122110
auto matchShiftAmount = [](Value *L, Value *R, unsigned Width) -> Value * {
2111+
// Check for constant shift amounts that sum to the bitwidth.
2112+
// TODO: Support non-uniform shift amounts.
2113+
const APInt *LC, *RC;
2114+
if (match(L, m_APInt(LC)) && match(R, m_APInt(RC)))
2115+
if (LC->ult(Width) && RC->ult(Width) && (*LC + *RC) == Width)
2116+
return L;
2117+
2118+
// For non-constant cases we don't support non-pow2 shift masks.
2119+
// TODO: Is it worth matching urem as well?
2120+
if (!isPowerOf2_32(Width))
2121+
return nullptr;
2122+
21132123
// The shift amount may be masked with negation:
21142124
// (shl ShVal, (X & (Width - 1))) | (lshr ShVal, ((-X) & (Width - 1)))
21152125
Value *X;

llvm/test/Transforms/InstCombine/bswap.ll

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -123,9 +123,7 @@ define i32 @bswap32_and_first(i32 %x) {
123123

124124
define i32 @bswap32_and_first_extra_use(i32 %x) {
125125
; CHECK-LABEL: @bswap32_and_first_extra_use(
126-
; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 16
127-
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 16
128-
; CHECK-NEXT: [[SWAPHALF:%.*]] = or i32 [[SHL]], [[SHR]]
126+
; CHECK-NEXT: [[SWAPHALF:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[X]], i32 16)
129127
; CHECK-NEXT: [[T:%.*]] = and i32 [[SWAPHALF]], 16711935
130128
; CHECK-NEXT: [[BSWAP:%.*]] = call i32 @llvm.bswap.i32(i32 [[X]])
131129
; CHECK-NEXT: call void @extra_use(i32 [[T]])
@@ -169,10 +167,8 @@ define i32 @bswap32_shl_first(i32 %x) {
169167

170168
define i32 @bswap32_shl_first_extra_use(i32 %x) {
171169
; CHECK-LABEL: @bswap32_shl_first_extra_use(
172-
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X:%.*]], 16
173-
; CHECK-NEXT: [[TMP1:%.*]] = shl i32 [[X]], 24
174-
; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[SHR]], 8
175-
; CHECK-NEXT: [[T:%.*]] = or i32 [[TMP1]], [[TMP2]]
170+
; CHECK-NEXT: [[SWAPHALF:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[X]], i32 16)
171+
; CHECK-NEXT: [[T:%.*]] = shl i32 [[SWAPHALF]], 8
176172
; CHECK-NEXT: [[BSWAP:%.*]] = call i32 @llvm.bswap.i32(i32 [[X]])
177173
; CHECK-NEXT: call void @extra_use(i32 [[T]])
178174
; CHECK-NEXT: ret i32 [[BSWAP]]

llvm/test/Transforms/InstCombine/fsh.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -521,9 +521,9 @@ define i33 @fshr_multi_use(i33 %a) {
521521

522522
define i33 @expanded_fshr_multi_use(i33 %a) {
523523
; CHECK-LABEL: @expanded_fshr_multi_use(
524-
; CHECK-NEXT: [[TMP:%.*]] = lshr i33 [[A:%.*]], 1
525-
; CHECK-NEXT: [[C:%.*]] = lshr i33 [[A]], 24
526-
; CHECK-NEXT: [[D:%.*]] = xor i33 [[C]], [[TMP]]
524+
; CHECK-NEXT: [[B:%.*]] = call i33 @llvm.fshl.i33(i33 [[A:%.*]], i33 [[A]], i33 32)
525+
; CHECK-NEXT: [[C:%.*]] = lshr i33 [[B]], 23
526+
; CHECK-NEXT: [[D:%.*]] = xor i33 [[C]], [[B]]
527527
; CHECK-NEXT: [[E:%.*]] = and i33 [[D]], 31
528528
; CHECK-NEXT: ret i33 [[E]]
529529
;

llvm/test/Transforms/InstCombine/or-concat.ll

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -47,11 +47,9 @@ define <2 x i64> @concat_bswap32_unary_split_vector(<2 x i64> %a0) {
4747

4848
define i64 @concat_bswap32_unary_flip(i64 %a0) {
4949
; CHECK-LABEL: @concat_bswap32_unary_flip(
50-
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[A0:%.*]], 32
51-
; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[A0]], 32
52-
; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], [[TMP2]]
53-
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP3]])
54-
; CHECK-NEXT: ret i64 [[TMP4]]
50+
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.fshl.i64(i64 [[A0:%.*]], i64 [[A0]], i64 32)
51+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
52+
; CHECK-NEXT: ret i64 [[TMP2]]
5553
;
5654
%1 = lshr i64 %a0, 32
5755
%2 = trunc i64 %1 to i32
@@ -67,11 +65,9 @@ define i64 @concat_bswap32_unary_flip(i64 %a0) {
6765

6866
define <2 x i64> @concat_bswap32_unary_flip_vector(<2 x i64> %a0) {
6967
; CHECK-LABEL: @concat_bswap32_unary_flip_vector(
70-
; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i64> [[A0:%.*]], <i64 32, i64 32>
71-
; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[A0]], <i64 32, i64 32>
72-
; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
73-
; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP3]])
74-
; CHECK-NEXT: ret <2 x i64> [[TMP4]]
68+
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[A0:%.*]], <2 x i64> [[A0]], <2 x i64> <i64 32, i64 32>)
69+
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> [[TMP1]])
70+
; CHECK-NEXT: ret <2 x i64> [[TMP2]]
7571
;
7672
%1 = lshr <2 x i64> %a0, <i64 32, i64 32>
7773
%2 = trunc <2 x i64> %1 to <2 x i32>
@@ -162,11 +158,9 @@ define <2 x i64> @concat_bitreverse32_unary_split_vector(<2 x i64> %a0) {
162158

163159
define i64 @concat_bitreverse32_unary_flip(i64 %a0) {
164160
; CHECK-LABEL: @concat_bitreverse32_unary_flip(
165-
; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[A0:%.*]], 32
166-
; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[A0]], 32
167-
; CHECK-NEXT: [[TMP3:%.*]] = or i64 [[TMP1]], [[TMP2]]
168-
; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[TMP3]])
169-
; CHECK-NEXT: ret i64 [[TMP4]]
161+
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.fshl.i64(i64 [[A0:%.*]], i64 [[A0]], i64 32)
162+
; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[TMP1]])
163+
; CHECK-NEXT: ret i64 [[TMP2]]
170164
;
171165
%1 = lshr i64 %a0, 32
172166
%2 = trunc i64 %1 to i32
@@ -182,11 +176,9 @@ define i64 @concat_bitreverse32_unary_flip(i64 %a0) {
182176

183177
define <2 x i64> @concat_bitreverse32_unary_flip_vector(<2 x i64> %a0) {
184178
; CHECK-LABEL: @concat_bitreverse32_unary_flip_vector(
185-
; CHECK-NEXT: [[TMP1:%.*]] = lshr <2 x i64> [[A0:%.*]], <i64 32, i64 32>
186-
; CHECK-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[A0]], <i64 32, i64 32>
187-
; CHECK-NEXT: [[TMP3:%.*]] = or <2 x i64> [[TMP1]], [[TMP2]]
188-
; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP3]])
189-
; CHECK-NEXT: ret <2 x i64> [[TMP4]]
179+
; CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> [[A0:%.*]], <2 x i64> [[A0]], <2 x i64> <i64 32, i64 32>)
180+
; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])
181+
; CHECK-NEXT: ret <2 x i64> [[TMP2]]
190182
;
191183
%1 = lshr <2 x i64> %a0, <i64 32, i64 32>
192184
%2 = trunc <2 x i64> %1 to <2 x i32>

llvm/test/Transforms/InstCombine/rotate.ll

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,14 @@
33

44
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
55

6-
; TODO: Canonicalize rotate by constant to funnel shift intrinsics.
6+
; Canonicalize rotate by constant to funnel shift intrinsics.
77
; This should help cost modeling for vectorization, inlining, etc.
88
; If a target does not have a rotate instruction, the expansion will
99
; be exactly these same 3 basic ops (shl/lshr/or).
1010

1111
define i32 @rotl_i32_constant(i32 %x) {
1212
; CHECK-LABEL: @rotl_i32_constant(
13-
; CHECK-NEXT: [[SHL:%.*]] = shl i32 [[X:%.*]], 11
14-
; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[X]], 21
15-
; CHECK-NEXT: [[R:%.*]] = or i32 [[SHR]], [[SHL]]
13+
; CHECK-NEXT: [[R:%.*]] = call i32 @llvm.fshl.i32(i32 [[X:%.*]], i32 [[X]], i32 11)
1614
; CHECK-NEXT: ret i32 [[R]]
1715
;
1816
%shl = shl i32 %x, 11
@@ -23,9 +21,7 @@ define i32 @rotl_i32_constant(i32 %x) {
2321

2422
define i42 @rotr_i42_constant(i42 %x) {
2523
; CHECK-LABEL: @rotr_i42_constant(
26-
; CHECK-NEXT: [[SHL:%.*]] = shl i42 [[X:%.*]], 31
27-
; CHECK-NEXT: [[SHR:%.*]] = lshr i42 [[X]], 11
28-
; CHECK-NEXT: [[R:%.*]] = or i42 [[SHR]], [[SHL]]
24+
; CHECK-NEXT: [[R:%.*]] = call i42 @llvm.fshl.i42(i42 [[X:%.*]], i42 [[X]], i42 31)
2925
; CHECK-NEXT: ret i42 [[R]]
3026
;
3127
%shl = shl i42 %x, 31
@@ -36,9 +32,7 @@ define i42 @rotr_i42_constant(i42 %x) {
3632

3733
define i8 @rotr_i8_constant_commute(i8 %x) {
3834
; CHECK-LABEL: @rotr_i8_constant_commute(
39-
; CHECK-NEXT: [[SHL:%.*]] = shl i8 [[X:%.*]], 5
40-
; CHECK-NEXT: [[SHR:%.*]] = lshr i8 [[X]], 3
41-
; CHECK-NEXT: [[R:%.*]] = or i8 [[SHL]], [[SHR]]
35+
; CHECK-NEXT: [[R:%.*]] = call i8 @llvm.fshl.i8(i8 [[X:%.*]], i8 [[X]], i8 5)
4236
; CHECK-NEXT: ret i8 [[R]]
4337
;
4438
%shl = shl i8 %x, 5
@@ -49,9 +43,7 @@ define i8 @rotr_i8_constant_commute(i8 %x) {
4943

5044
define i88 @rotl_i88_constant_commute(i88 %x) {
5145
; CHECK-LABEL: @rotl_i88_constant_commute(
52-
; CHECK-NEXT: [[SHL:%.*]] = shl i88 [[X:%.*]], 44
53-
; CHECK-NEXT: [[SHR:%.*]] = lshr i88 [[X]], 44
54-
; CHECK-NEXT: [[R:%.*]] = or i88 [[SHL]], [[SHR]]
46+
; CHECK-NEXT: [[R:%.*]] = call i88 @llvm.fshl.i88(i88 [[X:%.*]], i88 [[X]], i88 44)
5547
; CHECK-NEXT: ret i88 [[R]]
5648
;
5749
%shl = shl i88 %x, 44
@@ -64,9 +56,7 @@ define i88 @rotl_i88_constant_commute(i88 %x) {
6456

6557
define <2 x i16> @rotl_v2i16_constant_splat(<2 x i16> %x) {
6658
; CHECK-LABEL: @rotl_v2i16_constant_splat(
67-
; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i16> [[X:%.*]], <i16 1, i16 1>
68-
; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i16> [[X]], <i16 15, i16 15>
69-
; CHECK-NEXT: [[R:%.*]] = or <2 x i16> [[SHL]], [[SHR]]
59+
; CHECK-NEXT: [[R:%.*]] = call <2 x i16> @llvm.fshl.v2i16(<2 x i16> [[X:%.*]], <2 x i16> [[X]], <2 x i16> <i16 1, i16 1>)
7060
; CHECK-NEXT: ret <2 x i16> [[R]]
7161
;
7262
%shl = shl <2 x i16> %x, <i16 1, i16 1>
@@ -79,9 +69,7 @@ define <2 x i16> @rotl_v2i16_constant_splat(<2 x i16> %x) {
7969

8070
define <2 x i17> @rotr_v2i17_constant_splat(<2 x i17> %x) {
8171
; CHECK-LABEL: @rotr_v2i17_constant_splat(
82-
; CHECK-NEXT: [[SHL:%.*]] = shl <2 x i17> [[X:%.*]], <i17 12, i17 12>
83-
; CHECK-NEXT: [[SHR:%.*]] = lshr <2 x i17> [[X]], <i17 5, i17 5>
84-
; CHECK-NEXT: [[R:%.*]] = or <2 x i17> [[SHR]], [[SHL]]
72+
; CHECK-NEXT: [[R:%.*]] = call <2 x i17> @llvm.fshl.v2i17(<2 x i17> [[X:%.*]], <2 x i17> [[X]], <2 x i17> <i17 12, i17 12>)
8573
; CHECK-NEXT: ret <2 x i17> [[R]]
8674
;
8775
%shl = shl <2 x i17> %x, <i17 12, i17 12>
@@ -90,7 +78,7 @@ define <2 x i17> @rotr_v2i17_constant_splat(<2 x i17> %x) {
9078
ret <2 x i17> %r
9179
}
9280

93-
; Allow arbitrary shift constants.
81+
; TODO: Allow arbitrary shift constants.
9482

9583
define <2 x i32> @rotr_v2i32_constant_nonsplat(<2 x i32> %x) {
9684
; CHECK-LABEL: @rotr_v2i32_constant_nonsplat(

0 commit comments

Comments (0)