Skip to content

Commit e56103d

Browse files
committed
[InstCombine] add multi-use demanded bits fold for add with low-bit mask
I noticed an add example like the one from D91343, so here's a similar patch. The logic is based on existing code for the single-use demanded bits fold. But I only matched a constant instead of using compute known bits on the operands because that was the motivating pattern that I noticed. I think this will allow removing a special-case (but incomplete) dedicated fold within visitAnd(), but I need to untangle the existing code to be sure.

https://rise4fun.com/Alive/V6fP

Name: add with low mask
Pre: (C1 & (-1 u>> countLeadingZeros(C2))) == 0
%a = add i8 %x, C1
%r = and i8 %a, C2
=>
%r = and i8 %x, C2

Differential Revision: https://reviews.llvm.org/D91415
1 parent 91aa211 commit e56103d

File tree

6 files changed

+31
-8
lines changed

6 files changed

+31
-8
lines changed

llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -826,6 +826,21 @@ Value *InstCombinerImpl::SimplifyMultipleUseDemandedBits(
826826
// do simplifications that apply to *just* the one user if we know that
827827
// this instruction has a simpler value in that context.
828828
switch (I->getOpcode()) {
829+
case Instruction::Add: {
830+
// TODO: Allow undefs and/or non-splat vectors.
831+
const APInt *C;
832+
if (match(I->getOperand(1), m_APInt(C))) {
833+
// Right fill the demanded bits for this add to demand the most
834+
// significant demanded bit and all those below it.
835+
unsigned Ctlz = DemandedMask.countLeadingZeros();
836+
APInt LowMask(APInt::getLowBitsSet(BitWidth, BitWidth - Ctlz));
837+
// If we are adding zeros to every bit below the highest demanded bit,
838+
// just return the add's variable operand.
839+
if ((*C & LowMask).isNullValue())
840+
return I->getOperand(0);
841+
}
842+
break;
843+
}
829844
case Instruction::And: {
830845
// If either the LHS or the RHS are Zero, the result is zero.
831846
computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);

llvm/test/Transforms/InstCombine/and.ll

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1049,11 +1049,13 @@ define <2 x i32> @lowmask_sext_in_reg_splat(<2 x i32> %x, <2 x i32>* %p) {
10491049
ret <2 x i32> %and
10501050
}
10511051

1052+
; Multi-use demanded bits - 'add' doesn't change 'and'
1053+
10521054
define i8 @lowmask_add(i8 %x) {
10531055
; CHECK-LABEL: @lowmask_add(
10541056
; CHECK-NEXT: [[A:%.*]] = add i8 [[X:%.*]], -64
10551057
; CHECK-NEXT: call void @use8(i8 [[A]])
1056-
; CHECK-NEXT: [[R:%.*]] = and i8 [[A]], 32
1058+
; CHECK-NEXT: [[R:%.*]] = and i8 [[X]], 32
10571059
; CHECK-NEXT: ret i8 [[R]]
10581060
;
10591061
%a = add i8 %x, -64 ; 0xc0
@@ -1062,6 +1064,8 @@ define i8 @lowmask_add(i8 %x) {
10621064
ret i8 %r
10631065
}
10641066

1067+
; Negative test - mask overlaps low bit of add
1068+
10651069
define i8 @not_lowmask_add(i8 %x) {
10661070
; CHECK-LABEL: @not_lowmask_add(
10671071
; CHECK-NEXT: [[A:%.*]] = add i8 [[X:%.*]], -64
@@ -1075,6 +1079,8 @@ define i8 @not_lowmask_add(i8 %x) {
10751079
ret i8 %r
10761080
}
10771081

1082+
; Negative test - mask overlaps low bit of add
1083+
10781084
define i8 @not_lowmask_add2(i8 %x) {
10791085
; CHECK-LABEL: @not_lowmask_add2(
10801086
; CHECK-NEXT: [[A:%.*]] = add i8 [[X:%.*]], -96
@@ -1088,11 +1094,13 @@ define i8 @not_lowmask_add2(i8 %x) {
10881094
ret i8 %r
10891095
}
10901096

1097+
; Multi-use demanded bits - 'add' doesn't change 'and'
1098+
10911099
define <2 x i8> @lowmask_add_splat(<2 x i8> %x, <2 x i8>* %p) {
10921100
; CHECK-LABEL: @lowmask_add_splat(
10931101
; CHECK-NEXT: [[A:%.*]] = add <2 x i8> [[X:%.*]], <i8 -64, i8 -64>
10941102
; CHECK-NEXT: store <2 x i8> [[A]], <2 x i8>* [[P:%.*]], align 2
1095-
; CHECK-NEXT: [[R:%.*]] = and <2 x i8> [[A]], <i8 32, i8 32>
1103+
; CHECK-NEXT: [[R:%.*]] = and <2 x i8> [[X]], <i8 32, i8 32>
10961104
; CHECK-NEXT: ret <2 x i8> [[R]]
10971105
;
10981106
%a = add <2 x i8> %x, <i8 -64, i8 -64> ; 0xc0

llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ define void @fp_iv_loop1(float* noalias nocapture %A, i32 %N) #0 {
3535
; AUTO_VEC-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP1]], 96
3636
; AUTO_VEC-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
3737
; AUTO_VEC: vector.ph.new:
38-
; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], 1152921504606846972
38+
; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP3]], -4
3939
; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]]
4040
; AUTO_VEC: vector.body:
4141
; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]
@@ -306,7 +306,7 @@ define double @external_use_with_fast_math(double* %a, i64 %n) {
306306
; AUTO_VEC-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP2]], 48
307307
; AUTO_VEC-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
308308
; AUTO_VEC: vector.ph.new:
309-
; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP4]], 2305843009213693948
309+
; AUTO_VEC-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP4]], -4
310310
; AUTO_VEC-NEXT: br label [[VECTOR_BODY:%.*]]
311311
; AUTO_VEC: vector.body:
312312
; AUTO_VEC-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH_NEW]] ], [ [[INDEX_NEXT_3:%.*]], [[VECTOR_BODY]] ]

llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
2525
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
2626
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
2727
; CHECK: vector.ph:
28-
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588
28+
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -4
2929
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
3030
; CHECK: vector.body:
3131
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]

llvm/test/Transforms/LoopVectorize/runtime-check.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function foo
22
; RUN: opt < %s -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
33
; RUN: opt < %s -loop-vectorize -disable-basic-aa -S -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s -check-prefix=FORCED_OPTSIZE
44

@@ -32,7 +32,7 @@ define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtab
3232
; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]], [[DBG9]]
3333
; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]], [[DBG9]]
3434
; CHECK: vector.ph:
35-
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], 8589934588, [[DBG9]]
35+
; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], -4, [[DBG9]]
3636
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]], [[DBG9]]
3737
; CHECK: vector.body:
3838
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [[DBG9]]

llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ define void @vdiv(double* %x, double* %y, double %a, i32 %N) #0 {
3838
; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], 12
3939
; CHECK-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK_UNR_LCSSA:%.*]], label [[VECTOR_PH_NEW:%.*]]
4040
; CHECK: vector.ph.new:
41-
; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP2]], 9223372036854775804
41+
; CHECK-NEXT: [[UNROLL_ITER:%.*]] = and i64 [[TMP2]], -4
4242
; CHECK-NEXT: [[TMP4:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
4343
; CHECK-NEXT: [[TMP5:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]
4444
; CHECK-NEXT: [[TMP6:%.*]] = fdiv fast <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, [[BROADCAST_SPLAT]]

0 commit comments

Comments
 (0)