
Commit 795b067

Author: Ehsan Amiri
[InstCombine] New opportunities for FoldAndOfICmp and FoldXorOfICmp
A number of new patterns for simplifying and/xor of icmp:

(icmp ne %x, 0) ^ (icmp ne %y, 0) => icmp ne %x, %y
if the following is true:
1- (%x = and %a, %mask) and (%y = and %b, %mask)
2- %mask is a power of 2.

(icmp eq %x, 0) & (icmp ne %y, 0) => icmp ult %x, %y
if the following is true:
1- (%x = and %a, %mask1) and (%y = and %b, %mask2)
2- Let %t be the smallest power of 2 where %mask1 & %t != 0. Then for any
   %s that is a power of 2 and %s & %mask2 != 0, we must have %s <= %t.
   For example, if %mask1 = 24 and %mask2 = 16, setting %s = 16 and %t = 8
   violates condition (2) above. So this optimization cannot be applied.

llvm-svn: 289813
1 parent 3da2619 commit 795b067
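The first pattern is easy to sanity-check by brute force: with a power-of-2 mask, %x and %y can each only be 0 or %mask, so xoring the two nonzero-tests is true exactly when the two masked values differ. The snippet below is an editor-added sketch (not part of the commit; the mask value 8 is an arbitrary power of two) that verifies the equivalence over all 8-bit inputs:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Mask = 8; // must be a power of two for the fold to be valid
  for (uint32_t a = 0; a < 256; ++a)
    for (uint32_t b = 0; b < 256; ++b) {
      uint32_t x = a & Mask, y = b & Mask;
      bool original = (x != 0) ^ (y != 0); // (icmp ne %x, 0) ^ (icmp ne %y, 0)
      bool folded = x != y;                // icmp ne %x, %y
      assert(original == folded);
    }
  return 0;
}

A non-power-of-2 mask breaks the equivalence: with %mask = 24, a = 8 and b = 16 give %x = 8 and %y = 16, so %x != %y even though both nonzero-tests agree (see @test9 in the new tests).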

File tree

3 files changed: +302 −2 lines changed

llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp

Lines changed: 97 additions & 2 deletions
@@ -733,6 +733,44 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
   return nullptr;
 }
 
+namespace {
+
+struct BitGroupCheck {
+  // Does the Cmp check that the bits in the group are nonzero?
+  bool CheckIfSet {false};
+  // The mask that identifies the bit group in question.
+  const APInt *Mask {nullptr};
+};
+}
+/// For an ICMP where RHS is zero, we want to check if the ICMP is equivalent
+/// to comparing a group of bits in an integer value against zero.
+BitGroupCheck isAnyBitSet(Value *LHS, ICmpInst::Predicate CC) {
+
+  BitGroupCheck BGC;
+  auto *Inst = dyn_cast<Instruction>(LHS);
+
+  if (!Inst || Inst->getOpcode() != Instruction::And)
+    return BGC;
+
+  // TODO: Currently this does not work for vectors.
+  ConstantInt *Mask;
+  if (!match(LHS, m_And(m_Value(), m_ConstantInt(Mask))))
+    return BGC;
+  // At this point we know that the LHS of the ICMP is an "and" of a value
+  // with a constant, and that the RHS is zero. That means we are checking
+  // whether a certain group of bits in a given integer value are all zero
+  // or at least one of them is set to one.
+  if (CC == ICmpInst::ICMP_EQ)
+    BGC.CheckIfSet = false;
+  else if (CC == ICmpInst::ICMP_NE)
+    BGC.CheckIfSet = true;
+  else
+    return BGC;
+
+  BGC.Mask = &Mask->getValue();
+  return BGC;
+}
+
 /// Try to fold a signed range checked with lower bound 0 to an unsigned icmp.
 /// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
 /// If \p Inverted is true then the check is for the inverted range, e.g.
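isAnyBitSet only recognizes equality and inequality of a masked value against zero; any other predicate leaves Mask null, which the callers below treat as "no match". Here is a minimal, LLVM-free model of that classification (an editor's sketch: the plain enum and uint32_t stand in for ICmpInst::Predicate and APInt):

#include <cassert>
#include <cstdint>
#include <optional>

enum class Predicate { EQ, NE, SLT /* stands in for any other predicate */ };

struct BitGroupCheck {
  bool CheckIfSet = false;      // true: "is any bit in the group set?"
  std::optional<uint32_t> Mask; // the bit group; empty when unrecognized
};

// Classify "icmp <pred> (and %a, mask), 0" the way isAnyBitSet does.
BitGroupCheck classify(uint32_t mask, Predicate pred) {
  BitGroupCheck bgc;
  if (pred == Predicate::EQ)
    bgc.CheckIfSet = false; // all bits in the group are zero
  else if (pred == Predicate::NE)
    bgc.CheckIfSet = true;  // at least one bit in the group is set
  else
    return bgc;             // unsupported predicate: Mask stays empty
  bgc.Mask = mask;
  return bgc;
}

int main() {
  assert(classify(8, Predicate::NE).CheckIfSet);         // (and %a, 8) != 0
  assert(!classify(8, Predicate::EQ).CheckIfSet);        // (and %a, 8) == 0
  assert(!classify(8, Predicate::SLT).Mask.has_value()); // not handled
  return 0;
}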
@@ -789,6 +827,32 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
   return Builder->CreateICmp(NewPred, Input, RangeEnd);
 }
 
+Value *InstCombiner::FoldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
+
+  Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
+  // TODO: The lines below do not work for vectors. ConstantInt is scalar.
+  auto *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  auto *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
+  if (!LHSCst || !RHSCst)
+    return nullptr;
+  ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
+
+  // E.g. (icmp ne %x, 0) ^ (icmp ne %y, 0) => icmp ne %x, %y if the following
+  // conditions hold:
+  // 1- (%x = and %a, %mask) and (%y = and %b, %mask)
+  // 2- %mask is a power of 2.
+  if (RHSCst->isZero() && LHSCst == RHSCst) {
+
+    BitGroupCheck BGC1 = isAnyBitSet(Val, LHSCC);
+    BitGroupCheck BGC2 = isAnyBitSet(Val2, RHSCC);
+    if (BGC1.Mask && BGC2.Mask && BGC1.CheckIfSet == BGC2.CheckIfSet &&
+        *BGC1.Mask == *BGC2.Mask && BGC1.Mask->isPowerOf2()) {
+      return Builder->CreateICmp(ICmpInst::ICMP_NE, Val2, Val);
+    }
+  }
+  return nullptr;
+}
+
 /// Fold (icmp)&(icmp) if possible.
 Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
   ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
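Note that the guard in FoldXorOfICmps (and in the FoldAndOfICmps change below) compares the two ConstantInt pointers directly. Since LLVM uniques integer constants, pointer equality implies the two compares test against a zero of the same bit width, which is what rules out the mixed-width inputs exercised by @test7 and @test11 in the new tests.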
@@ -871,6 +935,29 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
     }
   }
 
+  // E.g. (icmp eq %x, 0) & (icmp ne %y, 0) => icmp ult %x, %y if the following
+  // conditions hold:
+  // 1- (%x = and %a, %mask1) and (%y = and %b, %mask2)
+  // 2- Let %t be the smallest power of 2 where %mask1 & %t != 0. Then for any
+  //    %s that is a power of 2 and %s & %mask2 != 0, we must have %s <= %t.
+  //    For example, if %mask1 = 24 and %mask2 = 16, setting %s = 16 and %t = 8
+  //    violates condition (2) above. So this optimization cannot be applied.
+  if (RHSCst->isZero() && LHSCst == RHSCst) {
+    BitGroupCheck BGC1 = isAnyBitSet(Val, LHSCC);
+    BitGroupCheck BGC2 = isAnyBitSet(Val2, RHSCC);
+
+    if (BGC1.Mask && BGC2.Mask && (BGC1.CheckIfSet != BGC2.CheckIfSet)) {
+      if (!BGC1.CheckIfSet &&
+          BGC1.Mask->countTrailingZeros() >=
+              BGC2.Mask->getBitWidth() - BGC2.Mask->countLeadingZeros() - 1)
+        return Builder->CreateICmp(ICmpInst::ICMP_ULT, Val, Val2);
+      else if (!BGC2.CheckIfSet &&
+               BGC2.Mask->countTrailingZeros() >=
+                   BGC1.Mask->getBitWidth() - BGC1.Mask->countLeadingZeros() - 1)
+        return Builder->CreateICmp(ICmpInst::ICMP_ULT, Val2, Val);
+    }
+  }
+
   // From here on, we only handle:
   // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
   if (Val != Val2) return nullptr;
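The bit-position test above encodes condition (2) from the commit message: countTrailingZeros on the eq-side mask gives the position of its lowest set bit (%t), while BitWidth - countLeadingZeros - 1 on the ne-side mask gives the position of its highest set bit (the largest %s). An editor-added sketch (plain loops stand in for the APInt helpers; assumes nonzero masks) that evaluates the test for the mask pairs discussed here:

#include <cstdint>
#include <cstdio>

// Position of the lowest set bit, i.e. countTrailingZeros (v must be nonzero).
static unsigned lowestBitPos(uint32_t v) {
  unsigned n = 0;
  while (!(v & 1)) { v >>= 1; ++n; }
  return n;
}

// Position of the highest set bit, i.e. BitWidth - countLeadingZeros - 1.
static unsigned highestBitPos(uint32_t v) {
  unsigned n = 0;
  while (v >>= 1) ++n;
  return n;
}

int main() {
  struct { uint32_t MaskEq, MaskNe; } Pairs[] = {{24, 16}, {8, 8}, {24, 8}};
  for (auto P : Pairs)
    printf("maskEq=%u maskNe=%u -> fold %s\n", P.MaskEq, P.MaskNe,
           lowestBitPos(P.MaskEq) >= highestBitPos(P.MaskNe) ? "applies"
                                                             : "rejected");
  return 0;
}

For masks 24 and 16 the fold is rejected (bit 3 vs. bit 4), matching the counterexample in the comment, while the pairs 8/8 and 24/8 pass the test.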
@@ -2704,9 +2791,16 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
       match(Op1, m_Not(m_Specific(A))))
     return BinaryOperator::CreateNot(Builder->CreateAnd(A, B));
 
-  // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
   if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
-    if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
+    if (ICmpInst *LHS = dyn_cast<ICmpInst>(I.getOperand(0))) {
+
+      // E.g. if we have xor (icmp eq %A, 0), (icmp eq %B, 0)
+      // and we know both A and B are either 8 (a power of 2) or 0,
+      // we can simplify this to (icmp ne A, B).
+      if (Value *Res = FoldXorOfICmps(LHS, RHS))
+        return replaceInstUsesWith(I, Res);
+
+      // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
       if (PredicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
         if (LHS->getOperand(0) == RHS->getOperand(1) &&
             LHS->getOperand(1) == RHS->getOperand(0))
@@ -2721,6 +2815,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
                                                       Builder));
       }
     }
+  }
 
   if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
     return CastedXor;

llvm/lib/Transforms/InstCombine/InstCombineInternal.h

Lines changed: 1 addition & 0 deletions
@@ -239,6 +239,7 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner
   Instruction *visitFDiv(BinaryOperator &I);
   Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted);
   Value *FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS);
+  Value *FoldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS);
   Value *FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS);
   Instruction *visitAnd(BinaryOperator &I);
   Value *FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction *CxtI);

llvm/test/Transforms/InstCombine/and-or-icmps.ll

Lines changed: 204 additions & 0 deletions
@@ -51,3 +51,207 @@ define i1 @test(i32 %tmp1030) {
   ret i1 %tmp1042
 }
 
+; The last three instructions (ignoring the ret) are equivalent to %val2 < %val1.
+define i1 @test2(i32 %a, i32 %b) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 8
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 8
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %val1 = and i32 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; The last three instructions (ignoring the ret) are equivalent to %val2 < %val1.
+define i1 @test3(i32 %a, i32 %b) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 8
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 8
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %val1 = and i32 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.a, %cmp.b
+  ret i1 %and
+}
+
+; The last three instructions (ignoring the ret) are equivalent to %val2 < %val1.
+define i1 @test4(i32 %a, i32 %b) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 15
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 24
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %val1 = and i32 %a, 15
+  %val2 = and i32 %b, 24
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.a, %cmp.b
+  ret i1 %and
+}
+
+; The last three instructions (ignoring the ret) are equivalent to %val2 < %val1.
+define i1 @test5(i32 %a, i32 %b) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 15
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 24
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %val1 = and i32 %a, 15
+  %val2 = and i32 %b, 24
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; An optimization like those of the previous tests is not possible here:
+; for example, if %b = 8 and %a = 16, we have %val2 = 8 and %val1 = 16,
+; so %val2 < %val1 but %and == 0.
+define i1 @test6(i32 %a, i32 %b) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 16
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 24
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ne i32 [[VAL1]], 0
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp eq i32 [[VAL2]], 0
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP_B]], [[CMP_A]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %val1 = and i32 %a, 16
+  %val2 = and i32 %b, 24
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; %a and %b have different widths, so the optimization is not possible.
+define i1 @test7(i16 %a, i32 %b) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i16 %a, 15
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 24
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ne i16 [[VAL1]], 0
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp eq i32 [[VAL2]], 0
+; CHECK-NEXT:    [[AND:%.*]] = and i1 [[CMP_B]], [[CMP_A]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %val1 = and i16 %a, 15
+  %val2 = and i32 %b, 24
+  %cmp.a = icmp ne i16 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; The last three instructions can be simplified to checking %val1 != %val2.
+; After that, other transformations change the code further.
+define i1 @test8(i32 %a, i32 %b) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    ret i1 [[TMP3]]
+;
+  %val1 = and i32 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp ne i32 %val2, 0
+  %and = xor i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; The masks of the and instructions must be identical powers of 2; otherwise
+; a simplification like that of the previous test case is not possible.
+define i1 @test9(i32 %a, i32 %b) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 24
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 24
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ne i32 [[VAL1]], 0
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp ne i32 [[VAL2]], 0
+; CHECK-NEXT:    [[AND:%.*]] = xor i1 [[CMP_B]], [[CMP_A]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %val1 = and i32 %a, 24
+  %val2 = and i32 %b, 24
+  %cmp.a = icmp ne i32 %val1, 0
+  %cmp.b = icmp ne i32 %val2, 0
+  %and = xor i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; The last three instructions are equivalent to checking %val1 != %val2.
+; After making this change, other transformations further change the code.
+define i1 @test10(i32 %a, i32 %b) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    ret i1 [[TMP3]]
+;
+  %val1 = and i32 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp eq i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = xor i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; Cannot be simplified because of the different widths of %a and %b.
+define i1 @test11(i16 %a, i32 %b) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i16 %a, 8
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 8
+; CHECK-NEXT:    [[CMP_A:%.*]] = icmp ne i16 [[VAL1]], 0
+; CHECK-NEXT:    [[CMP_B:%.*]] = icmp ne i32 [[VAL2]], 0
+; CHECK-NEXT:    [[AND:%.*]] = xor i1 [[CMP_B]], [[CMP_A]]
+; CHECK-NEXT:    ret i1 [[AND]]
+;
+  %val1 = and i16 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp ne i16 %val1, 0
+  %cmp.b = icmp ne i32 %val2, 0
+  %and = xor i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; Similar to @test8 except that the icmp instructions use ugt instead of ne.
+define i1 @test12(i32 %a, i32 %b) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], 8
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
+; CHECK-NEXT:    ret i1 [[TMP3]]
+;
+  %val1 = and i32 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp ugt i32 %val1, 0
+  %cmp.b = icmp ugt i32 %val2, 0
+  %and = xor i1 %cmp.b, %cmp.a
+  ret i1 %and
+}
+
+; Similar to @test3 except that the first icmp uses ugt instead of ne.
+define i1 @test13(i32 %a, i32 %b) {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:    [[VAL1:%.*]] = and i32 %a, 8
+; CHECK-NEXT:    [[VAL2:%.*]] = and i32 %b, 8
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]]
+; CHECK-NEXT:    ret i1 [[TMP1]]
+;
+  %val1 = and i32 %a, 8
+  %val2 = and i32 %b, 8
+  %cmp.a = icmp ugt i32 %val1, 0
+  %cmp.b = icmp eq i32 %val2, 0
+  %and = and i1 %cmp.a, %cmp.b
+  ret i1 %and
+}
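The ult folds checked by @test2, @test3, and @test13 can be brute-forced the same way as the xor pattern; an editor-added C++ sketch (not part of the commit) covering the single power-of-2 mask case over all 8-bit inputs:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Mask = 8; // the mask used by @test2/@test3/@test13
  for (uint32_t a = 0; a < 256; ++a)
    for (uint32_t b = 0; b < 256; ++b) {
      uint32_t val1 = a & Mask, val2 = b & Mask;
      bool original = (val2 == 0) && (val1 != 0); // %cmp.b and %cmp.a
      bool folded = val2 < val1;                  // icmp ult %val2, %val1
      assert(original == folded);
    }
  return 0;
}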
