Skip to content

Commit 8963c99

Browse files
committed
[X86] Fold (icmp ult (add x,-C),2) -> (or (icmp eq X,C), (icmp eq X,C+1)) for Vectors
This is undoing a middle-end transform which does the opposite. Since X86 doesn't have unsigned vector comparison instructions pre-AVX512, the simplified form gets worse codegen. Fixes #66479 Proofs: https://alive2.llvm.org/ce/z/UCz3wt
1 parent 6d366f7 commit 8963c99

File tree

2 files changed

+109
-57
lines changed

2 files changed

+109
-57
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
#include <algorithm>
6565
#include <bitset>
6666
#include <cctype>
67+
#include "llvm/CodeGen/ISDOpcodes.h"
6768
#include <numeric>
6869
using namespace llvm;
6970

@@ -53408,6 +53409,64 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
5340853409
truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget))
5340953410
return R;
5341053411

53412+
// In the middle end transforms:
53413+
// `(or (icmp eq X, C), (icmp eq X, C+1))`
53414+
// -> `(icmp ult (add x, -C), 2)`
53415+
// Likewise inverted cases with `ugt`.
53416+
//
53417+
// Since x86, pre avx512, doesn't have unsigned vector compares, this results
53418+
// in worse codegen. So, undo the middle-end transform and go back to `(or
53419+
// (icmp eq), (icmp eq))` form.
53420+
//
53421+
// NB: We don't handle the similar simplification of `(and (icmp ne), (icmp
53422+
// ne))` as it doesn't end up instruction positive.
53423+
// TODO: We might want to do this for avx512 as well if we `sext` the result.
53424+
if (VT.isVector() && OpVT.isVector() && OpVT.isInteger() &&
53425+
ISD::isUnsignedIntSetCC(CC) && LHS.getOpcode() == ISD::ADD &&
53426+
!Subtarget.hasAVX512() && LHS.hasOneUse()) {
53427+
53428+
APInt CmpC;
53429+
SDValue AddC = LHS.getOperand(1);
53430+
if (ISD::isConstantSplatVector(RHS.getNode(), CmpC) &&
53431+
DAG.isConstantIntBuildVectorOrConstantInt(AddC)) {
53432+
// See which form we have depending on the constant/condition.
53433+
SDValue C0 = SDValue();
53434+
SDValue C1 = SDValue();
53435+
53436+
// If we had `(add x, -1)` and can lower with `umin`, don't transform as
53437+
// we will end up generating an additional constant. Keeping in the
53438+
// current form has a slight latency cost, but it's probably worth saving a
53439+
// constant.
53440+
if (ISD::isConstantSplatVectorAllOnes(AddC.getNode()) &&
53441+
DAG.getTargetLoweringInfo().isOperationLegal(ISD::UMIN, OpVT)) {
53442+
// Pass
53443+
}
53444+
// Normal Cases
53445+
else if ((CC == ISD::SETULT && CmpC == 2) ||
53446+
(CC == ISD::SETULE && CmpC == 1)) {
53447+
// These will constant fold.
53448+
C0 = DAG.getNegative(AddC, DL, OpVT);
53449+
C1 = DAG.getNode(ISD::SUB, DL, OpVT, C0,
53450+
DAG.getAllOnesConstant(DL, OpVT));
53451+
}
53452+
// Inverted Cases
53453+
else if ((CC == ISD::SETUGT && (-CmpC) == 3) ||
53454+
(CC == ISD::SETUGE && (-CmpC) == 2)) {
53455+
// These will constant fold.
53456+
C0 = DAG.getNOT(DL, AddC, OpVT);
53457+
C1 = DAG.getNode(ISD::ADD, DL, OpVT, C0,
53458+
DAG.getAllOnesConstant(DL, OpVT));
53459+
}
53460+
if (C0 && C1) {
53461+
SDValue NewLHS =
53462+
DAG.getSetCC(DL, VT, LHS.getOperand(0), C0, ISD::SETEQ);
53463+
SDValue NewRHS =
53464+
DAG.getSetCC(DL, VT, LHS.getOperand(0), C1, ISD::SETEQ);
53465+
return DAG.getNode(ISD::OR, DL, VT, NewLHS, NewRHS);
53466+
}
53467+
}
53468+
}
53469+
5341153470
// For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
5341253471
// to avoid scalarization via legalization because v4i32 is not a legal type.
5341353472
if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&

llvm/test/CodeGen/X86/eq-or-eq-range-of-2.ll

Lines changed: 50 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -17,28 +17,27 @@ define <4 x i32> @eq_or_eq_ult_2(<4 x i32> %x) {
1717
;
1818
; AVX2-LABEL: eq_or_eq_ult_2:
1919
; AVX2: # %bb.0:
20-
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967291,4294967291,4294967291,4294967291]
21-
; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
22-
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
23-
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
24-
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
20+
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [6,6,6,6]
21+
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
22+
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [5,5,5,5]
23+
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
24+
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
2525
; AVX2-NEXT: retq
2626
;
2727
; SSE41-LABEL: eq_or_eq_ult_2:
2828
; SSE41: # %bb.0:
29-
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
30-
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [1,1,1,1]
31-
; SSE41-NEXT: pminud %xmm0, %xmm1
32-
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
29+
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [6,6,6,6]
30+
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
31+
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
32+
; SSE41-NEXT: por %xmm1, %xmm0
3333
; SSE41-NEXT: retq
3434
;
3535
; SSE2-LABEL: eq_or_eq_ult_2:
3636
; SSE2: # %bb.0:
37-
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
38-
; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
39-
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483650,2147483650,2147483650,2147483650]
40-
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
41-
; SSE2-NEXT: movdqa %xmm1, %xmm0
37+
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [6,6,6,6]
38+
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
39+
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
40+
; SSE2-NEXT: por %xmm1, %xmm0
4241
; SSE2-NEXT: retq
4342
%x_adj = add <4 x i32> %x, <i32 -5, i32 -5, i32 -5, i32 -5>
4443
%cmp = icmp ult <4 x i32> %x_adj, <i32 2, i32 2, i32 2, i32 2>
@@ -75,11 +74,10 @@ define <4 x i32> @eq_or_eq_ult_2_only_transform_sse2(<4 x i32> %x) {
7574
;
7675
; SSE2-LABEL: eq_or_eq_ult_2_only_transform_sse2:
7776
; SSE2: # %bb.0:
78-
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
79-
; SSE2-NEXT: paddd %xmm0, %xmm1
80-
; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
81-
; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [2147483650,2147483650,2147483650,2147483650]
82-
; SSE2-NEXT: pcmpgtd %xmm1, %xmm0
77+
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2,2,2,2]
78+
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
79+
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
80+
; SSE2-NEXT: por %xmm1, %xmm0
8381
; SSE2-NEXT: retq
8482
%x_adj = add <4 x i32> %x, <i32 -1, i32 -1, i32 -1, i32 -1>
8583
%cmp = icmp ult <4 x i32> %x_adj, <i32 2, i32 2, i32 2, i32 2>
@@ -210,25 +208,25 @@ define <4 x i32> @eq_or_eq_ugt_m3(<4 x i32> %x) {
210208
;
211209
; AVX2-LABEL: eq_or_eq_ugt_m3:
212210
; AVX2: # %bb.0:
213-
; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
214-
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967294,4294967294,4294967294,4294967294]
215-
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
216-
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
211+
; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
212+
; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
213+
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
217214
; AVX2-NEXT: retq
218215
;
219216
; SSE41-LABEL: eq_or_eq_ugt_m3:
220217
; SSE41: # %bb.0:
221-
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
222-
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967294,4294967294,4294967294,4294967294]
223-
; SSE41-NEXT: pmaxud %xmm0, %xmm1
224-
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
218+
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [9,12,9,9]
219+
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
220+
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
221+
; SSE41-NEXT: por %xmm1, %xmm0
225222
; SSE41-NEXT: retq
226223
;
227224
; SSE2-LABEL: eq_or_eq_ugt_m3:
228225
; SSE2: # %bb.0:
229-
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
230-
; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
231-
; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
226+
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [9,12,9,9]
227+
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
228+
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
229+
; SSE2-NEXT: por %xmm1, %xmm0
232230
; SSE2-NEXT: retq
233231
%x_adj = add <4 x i32> %x, <i32 -11, i32 -14, i32 -11, i32 -11>
234232
%cmp = icmp ugt <4 x i32> %x_adj, <i32 -3, i32 -3, i32 -3, i32 -3>
@@ -247,27 +245,25 @@ define <4 x i32> @eq_or_eq_ule_1(<4 x i32> %x) {
247245
;
248246
; AVX2-LABEL: eq_or_eq_ule_1:
249247
; AVX2: # %bb.0:
250-
; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
251-
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,1,1,1]
252-
; AVX2-NEXT: vpminud %xmm1, %xmm0, %xmm1
253-
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
248+
; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
249+
; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
250+
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
254251
; AVX2-NEXT: retq
255252
;
256253
; SSE41-LABEL: eq_or_eq_ule_1:
257254
; SSE41: # %bb.0:
258-
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
259-
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [1,1,1,1]
260-
; SSE41-NEXT: pminud %xmm0, %xmm1
261-
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
255+
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [0,4294967295,4294967294,4294967293]
256+
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
257+
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
258+
; SSE41-NEXT: por %xmm1, %xmm0
262259
; SSE41-NEXT: retq
263260
;
264261
; SSE2-LABEL: eq_or_eq_ule_1:
265262
; SSE2: # %bb.0:
266-
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
267-
; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
268-
; SSE2-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
269-
; SSE2-NEXT: pcmpeqd %xmm1, %xmm1
270-
; SSE2-NEXT: pxor %xmm1, %xmm0
263+
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,4294967295,4294967294,4294967293]
264+
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
265+
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
266+
; SSE2-NEXT: por %xmm1, %xmm0
271267
; SSE2-NEXT: retq
272268
%x_adj = add <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
273269
%cmp = icmp ule <4 x i32> %x_adj, <i32 1, i32 1, i32 1, i32 1>
@@ -286,28 +282,25 @@ define <4 x i32> @eq_or_eq_uge_m2(<4 x i32> %x) {
286282
;
287283
; AVX2-LABEL: eq_or_eq_uge_m2:
288284
; AVX2: # %bb.0:
289-
; AVX2-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
290-
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm1 = [4294967294,4294967294,4294967294,4294967294]
291-
; AVX2-NEXT: vpmaxud %xmm1, %xmm0, %xmm1
292-
; AVX2-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
285+
; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
286+
; AVX2-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
287+
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
293288
; AVX2-NEXT: retq
294289
;
295290
; SSE41-LABEL: eq_or_eq_uge_m2:
296291
; SSE41: # %bb.0:
297-
; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
298-
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967294,4294967294,4294967294,4294967294]
299-
; SSE41-NEXT: pmaxud %xmm0, %xmm1
300-
; SSE41-NEXT: pcmpeqd %xmm1, %xmm0
292+
; SSE41-NEXT: pmovsxbd {{.*#+}} xmm1 = [4294967293,4294967292,4294967291,4294967290]
293+
; SSE41-NEXT: pcmpeqd %xmm0, %xmm1
294+
; SSE41-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
295+
; SSE41-NEXT: por %xmm1, %xmm0
301296
; SSE41-NEXT: retq
302297
;
303298
; SSE2-LABEL: eq_or_eq_uge_m2:
304299
; SSE2: # %bb.0:
305-
; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
306-
; SSE2-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
307-
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2147483646,2147483646,2147483646,2147483646]
308-
; SSE2-NEXT: pcmpgtd %xmm0, %xmm1
309-
; SSE2-NEXT: pcmpeqd %xmm0, %xmm0
310-
; SSE2-NEXT: pxor %xmm1, %xmm0
300+
; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [4294967293,4294967292,4294967291,4294967290]
301+
; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
302+
; SSE2-NEXT: pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
303+
; SSE2-NEXT: por %xmm1, %xmm0
311304
; SSE2-NEXT: retq
312305
%x_adj = add <4 x i32> %x, <i32 1, i32 2, i32 3, i32 4>
313306
%cmp = icmp uge <4 x i32> %x_adj, <i32 -2, i32 -2, i32 -2, i32 -2>

0 commit comments

Comments
 (0)