Skip to content

Commit f24595f

Browse files
Poseydon42 authored and yuxuanchen1997 committed
[SelectionDAG] Expand [US]CMP using arithmetic on boolean values instead of selects (#98774)
Summary: The previous expansion of [US]CMP was done using two selects and two compares. It produced decent code, but on many platforms it is better to implement [US]CMP nodes by performing the following operation:

```
[us]cmp(x, y) = (x [us]> y) - (x [us]< y)
```

This patch adds this new expansion, as well as a hook in TargetLowering that allows some targets to keep using the select-based approach. AArch64 and SystemZ are currently the only targets that prefer the select-based approach, but other targets may also start to use it if it provides better codegen.

Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251481
1 parent a556297 commit f24595f

File tree

20 files changed

+4391
-2853
lines changed

20 files changed

+4391
-2853
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3391,6 +3391,10 @@ class TargetLoweringBase {
33913391
return isOperationLegalOrCustom(Op, VT);
33923392
}
33933393

3394+
/// Should we expand [US]CMP nodes using two selects and two compares, or by
3395+
/// doing arithmetic on boolean types? Return true to request the
/// select-based expansion; the default (false) lets the generic expansion
/// compute (x > y) - (x < y) on the setcc results when the boolean type
/// permits it.
3396+
virtual bool shouldExpandCmpUsingSelects() const { return false; }
3397+
33943398
/// Does this target support complex deinterleaving
33953399
virtual bool isComplexDeinterleavingSupported() const { return false; }
33963400

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10391,14 +10391,28 @@ SDValue TargetLowering::expandCMP(SDNode *Node, SelectionDAG &DAG) const {
1039110391

1039210392
auto LTPredicate = (Opcode == ISD::UCMP ? ISD::SETULT : ISD::SETLT);
1039310393
auto GTPredicate = (Opcode == ISD::UCMP ? ISD::SETUGT : ISD::SETGT);
10394-
1039510394
SDValue IsLT = DAG.getSetCC(dl, BoolVT, LHS, RHS, LTPredicate);
1039610395
SDValue IsGT = DAG.getSetCC(dl, BoolVT, LHS, RHS, GTPredicate);
10397-
SDValue SelectZeroOrOne =
10398-
DAG.getSelect(dl, ResVT, IsGT, DAG.getConstant(1, dl, ResVT),
10399-
DAG.getConstant(0, dl, ResVT));
10400-
return DAG.getSelect(dl, ResVT, IsLT, DAG.getConstant(-1, dl, ResVT),
10401-
SelectZeroOrOne);
10396+
10397+
// We can't perform arithmetic on i1 values. Extending them would
10398+
// probably result in worse codegen, so let's just use two selects instead.
10399+
// Some targets are also just better off using selects rather than subtraction
10400+
// because one of the conditions can be merged with one of the selects.
10401+
// And finally, if we don't know the contents of high bits of a boolean value
10402+
// we can't perform any arithmetic either.
10403+
if (shouldExpandCmpUsingSelects() || BoolVT.getScalarSizeInBits() == 1 ||
10404+
getBooleanContents(BoolVT) == UndefinedBooleanContent) {
10405+
SDValue SelectZeroOrOne =
10406+
DAG.getSelect(dl, ResVT, IsGT, DAG.getConstant(1, dl, ResVT),
10407+
DAG.getConstant(0, dl, ResVT));
10408+
return DAG.getSelect(dl, ResVT, IsLT, DAG.getConstant(-1, dl, ResVT),
10409+
SelectZeroOrOne);
10410+
}
10411+
10412+
if (getBooleanContents(BoolVT) == ZeroOrNegativeOneBooleanContent)
10413+
std::swap(IsGT, IsLT);
10414+
return DAG.getSExtOrTrunc(DAG.getNode(ISD::SUB, dl, BoolVT, IsGT, IsLT), dl,
10415+
ResVT);
1040210416
}
1040310417

1040410418
SDValue TargetLowering::expandShlSat(SDNode *Node, SelectionDAG &DAG) const {

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,8 @@ class AArch64TargetLowering : public TargetLowering {
907907

908908
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
909909

910+
/// AArch64 opts in to expanding [US]CMP with two selects and two compares
/// rather than with arithmetic on the boolean comparison results.
bool shouldExpandCmpUsingSelects() const override { return true; }
911+
910912
bool isComplexDeinterleavingSupported() const override;
911913
bool isComplexDeinterleavingOperationSupported(
912914
ComplexDeinterleavingOperation Operation, Type *Ty) const override;

llvm/lib/Target/SystemZ/SystemZISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -507,6 +507,8 @@ class SystemZTargetLowering : public TargetLowering {
507507

508508
bool shouldConsiderGEPOffsetSplit() const override { return true; }
509509

510+
/// SystemZ opts in to expanding [US]CMP with two selects and two compares
/// rather than with arithmetic on the boolean comparison results.
bool shouldExpandCmpUsingSelects() const override { return true; }
511+
510512
const char *getTargetNodeName(unsigned Opcode) const override;
511513
std::pair<unsigned, const TargetRegisterClass *>
512514
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,

llvm/test/CodeGen/ARM/scmp.ll

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=armv7-unknown-eabi %s -o - | FileCheck %s
3+
4+
define i8 @scmp_8_8(i8 signext %x, i8 signext %y) nounwind {
5+
; CHECK-LABEL: scmp_8_8:
6+
; CHECK: @ %bb.0:
7+
; CHECK-NEXT: cmp r0, r1
8+
; CHECK-NEXT: mov r0, #0
9+
; CHECK-NEXT: mov r2, #0
10+
; CHECK-NEXT: movwlt r0, #1
11+
; CHECK-NEXT: movwgt r2, #1
12+
; CHECK-NEXT: sub r0, r2, r0
13+
; CHECK-NEXT: bx lr
14+
%1 = call i8 @llvm.scmp(i8 %x, i8 %y)
15+
ret i8 %1
16+
}
17+
18+
define i8 @scmp_8_16(i16 signext %x, i16 signext %y) nounwind {
19+
; CHECK-LABEL: scmp_8_16:
20+
; CHECK: @ %bb.0:
21+
; CHECK-NEXT: cmp r0, r1
22+
; CHECK-NEXT: mov r0, #0
23+
; CHECK-NEXT: mov r2, #0
24+
; CHECK-NEXT: movwlt r0, #1
25+
; CHECK-NEXT: movwgt r2, #1
26+
; CHECK-NEXT: sub r0, r2, r0
27+
; CHECK-NEXT: bx lr
28+
%1 = call i8 @llvm.scmp(i16 %x, i16 %y)
29+
ret i8 %1
30+
}
31+
32+
define i8 @scmp_8_32(i32 %x, i32 %y) nounwind {
33+
; CHECK-LABEL: scmp_8_32:
34+
; CHECK: @ %bb.0:
35+
; CHECK-NEXT: cmp r0, r1
36+
; CHECK-NEXT: mov r0, #0
37+
; CHECK-NEXT: mov r2, #0
38+
; CHECK-NEXT: movwlt r0, #1
39+
; CHECK-NEXT: movwgt r2, #1
40+
; CHECK-NEXT: sub r0, r2, r0
41+
; CHECK-NEXT: bx lr
42+
%1 = call i8 @llvm.scmp(i32 %x, i32 %y)
43+
ret i8 %1
44+
}
45+
46+
define i8 @scmp_8_64(i64 %x, i64 %y) nounwind {
47+
; CHECK-LABEL: scmp_8_64:
48+
; CHECK: @ %bb.0:
49+
; CHECK-NEXT: .save {r11, lr}
50+
; CHECK-NEXT: push {r11, lr}
51+
; CHECK-NEXT: subs lr, r0, r2
52+
; CHECK-NEXT: mov r12, #0
53+
; CHECK-NEXT: sbcs lr, r1, r3
54+
; CHECK-NEXT: mov lr, #0
55+
; CHECK-NEXT: movwlt lr, #1
56+
; CHECK-NEXT: subs r0, r2, r0
57+
; CHECK-NEXT: sbcs r0, r3, r1
58+
; CHECK-NEXT: movwlt r12, #1
59+
; CHECK-NEXT: sub r0, r12, lr
60+
; CHECK-NEXT: pop {r11, pc}
61+
%1 = call i8 @llvm.scmp(i64 %x, i64 %y)
62+
ret i8 %1
63+
}
64+
65+
define i8 @scmp_8_128(i128 %x, i128 %y) nounwind {
66+
; CHECK-LABEL: scmp_8_128:
67+
; CHECK: @ %bb.0:
68+
; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
69+
; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr}
70+
; CHECK-NEXT: ldr r4, [sp, #24]
71+
; CHECK-NEXT: mov r5, #0
72+
; CHECK-NEXT: ldr r6, [sp, #28]
73+
; CHECK-NEXT: subs r7, r0, r4
74+
; CHECK-NEXT: ldr r12, [sp, #32]
75+
; CHECK-NEXT: sbcs r7, r1, r6
76+
; CHECK-NEXT: ldr lr, [sp, #36]
77+
; CHECK-NEXT: sbcs r7, r2, r12
78+
; CHECK-NEXT: sbcs r7, r3, lr
79+
; CHECK-NEXT: mov r7, #0
80+
; CHECK-NEXT: movwlt r7, #1
81+
; CHECK-NEXT: subs r0, r4, r0
82+
; CHECK-NEXT: sbcs r0, r6, r1
83+
; CHECK-NEXT: sbcs r0, r12, r2
84+
; CHECK-NEXT: sbcs r0, lr, r3
85+
; CHECK-NEXT: movwlt r5, #1
86+
; CHECK-NEXT: sub r0, r5, r7
87+
; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc}
88+
%1 = call i8 @llvm.scmp(i128 %x, i128 %y)
89+
ret i8 %1
90+
}
91+
92+
define i32 @scmp_32_32(i32 %x, i32 %y) nounwind {
93+
; CHECK-LABEL: scmp_32_32:
94+
; CHECK: @ %bb.0:
95+
; CHECK-NEXT: cmp r0, r1
96+
; CHECK-NEXT: mov r0, #0
97+
; CHECK-NEXT: mov r2, #0
98+
; CHECK-NEXT: movwlt r0, #1
99+
; CHECK-NEXT: movwgt r2, #1
100+
; CHECK-NEXT: sub r0, r2, r0
101+
; CHECK-NEXT: bx lr
102+
%1 = call i32 @llvm.scmp(i32 %x, i32 %y)
103+
ret i32 %1
104+
}
105+
106+
define i32 @scmp_32_64(i64 %x, i64 %y) nounwind {
107+
; CHECK-LABEL: scmp_32_64:
108+
; CHECK: @ %bb.0:
109+
; CHECK-NEXT: .save {r11, lr}
110+
; CHECK-NEXT: push {r11, lr}
111+
; CHECK-NEXT: subs lr, r0, r2
112+
; CHECK-NEXT: mov r12, #0
113+
; CHECK-NEXT: sbcs lr, r1, r3
114+
; CHECK-NEXT: mov lr, #0
115+
; CHECK-NEXT: movwlt lr, #1
116+
; CHECK-NEXT: subs r0, r2, r0
117+
; CHECK-NEXT: sbcs r0, r3, r1
118+
; CHECK-NEXT: movwlt r12, #1
119+
; CHECK-NEXT: sub r0, r12, lr
120+
; CHECK-NEXT: pop {r11, pc}
121+
%1 = call i32 @llvm.scmp(i64 %x, i64 %y)
122+
ret i32 %1
123+
}
124+
125+
define i64 @scmp_64_64(i64 %x, i64 %y) nounwind {
126+
; CHECK-LABEL: scmp_64_64:
127+
; CHECK: @ %bb.0:
128+
; CHECK-NEXT: .save {r11, lr}
129+
; CHECK-NEXT: push {r11, lr}
130+
; CHECK-NEXT: subs lr, r0, r2
131+
; CHECK-NEXT: mov r12, #0
132+
; CHECK-NEXT: sbcs lr, r1, r3
133+
; CHECK-NEXT: mov lr, #0
134+
; CHECK-NEXT: movwlt lr, #1
135+
; CHECK-NEXT: subs r0, r2, r0
136+
; CHECK-NEXT: sbcs r0, r3, r1
137+
; CHECK-NEXT: movwlt r12, #1
138+
; CHECK-NEXT: sub r0, r12, lr
139+
; CHECK-NEXT: asr r1, r0, #31
140+
; CHECK-NEXT: pop {r11, pc}
141+
%1 = call i64 @llvm.scmp(i64 %x, i64 %y)
142+
ret i64 %1
143+
}

llvm/test/CodeGen/ARM/ucmp.ll

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=armv7-unknown-eabi %s -o - | FileCheck %s
3+
4+
define i8 @ucmp_8_8(i8 zeroext %x, i8 zeroext %y) nounwind {
5+
; CHECK-LABEL: ucmp_8_8:
6+
; CHECK: @ %bb.0:
7+
; CHECK-NEXT: cmp r0, r1
8+
; CHECK-NEXT: mov r0, #0
9+
; CHECK-NEXT: mov r2, #0
10+
; CHECK-NEXT: movwlo r0, #1
11+
; CHECK-NEXT: movwhi r2, #1
12+
; CHECK-NEXT: sub r0, r2, r0
13+
; CHECK-NEXT: bx lr
14+
%1 = call i8 @llvm.ucmp(i8 %x, i8 %y)
15+
ret i8 %1
16+
}
17+
18+
define i8 @ucmp_8_16(i16 zeroext %x, i16 zeroext %y) nounwind {
19+
; CHECK-LABEL: ucmp_8_16:
20+
; CHECK: @ %bb.0:
21+
; CHECK-NEXT: cmp r0, r1
22+
; CHECK-NEXT: mov r0, #0
23+
; CHECK-NEXT: mov r2, #0
24+
; CHECK-NEXT: movwlo r0, #1
25+
; CHECK-NEXT: movwhi r2, #1
26+
; CHECK-NEXT: sub r0, r2, r0
27+
; CHECK-NEXT: bx lr
28+
%1 = call i8 @llvm.ucmp(i16 %x, i16 %y)
29+
ret i8 %1
30+
}
31+
32+
define i8 @ucmp_8_32(i32 %x, i32 %y) nounwind {
33+
; CHECK-LABEL: ucmp_8_32:
34+
; CHECK: @ %bb.0:
35+
; CHECK-NEXT: cmp r0, r1
36+
; CHECK-NEXT: mov r0, #0
37+
; CHECK-NEXT: mov r2, #0
38+
; CHECK-NEXT: movwlo r0, #1
39+
; CHECK-NEXT: movwhi r2, #1
40+
; CHECK-NEXT: sub r0, r2, r0
41+
; CHECK-NEXT: bx lr
42+
%1 = call i8 @llvm.ucmp(i32 %x, i32 %y)
43+
ret i8 %1
44+
}
45+
46+
define i8 @ucmp_8_64(i64 %x, i64 %y) nounwind {
47+
; CHECK-LABEL: ucmp_8_64:
48+
; CHECK: @ %bb.0:
49+
; CHECK-NEXT: .save {r11, lr}
50+
; CHECK-NEXT: push {r11, lr}
51+
; CHECK-NEXT: subs lr, r0, r2
52+
; CHECK-NEXT: mov r12, #0
53+
; CHECK-NEXT: sbcs lr, r1, r3
54+
; CHECK-NEXT: mov lr, #0
55+
; CHECK-NEXT: movwlo lr, #1
56+
; CHECK-NEXT: subs r0, r2, r0
57+
; CHECK-NEXT: sbcs r0, r3, r1
58+
; CHECK-NEXT: movwlo r12, #1
59+
; CHECK-NEXT: sub r0, r12, lr
60+
; CHECK-NEXT: pop {r11, pc}
61+
%1 = call i8 @llvm.ucmp(i64 %x, i64 %y)
62+
ret i8 %1
63+
}
64+
65+
define i8 @ucmp_8_128(i128 %x, i128 %y) nounwind {
66+
; CHECK-LABEL: ucmp_8_128:
67+
; CHECK: @ %bb.0:
68+
; CHECK-NEXT: .save {r4, r5, r6, r7, r11, lr}
69+
; CHECK-NEXT: push {r4, r5, r6, r7, r11, lr}
70+
; CHECK-NEXT: ldr r4, [sp, #24]
71+
; CHECK-NEXT: mov r5, #0
72+
; CHECK-NEXT: ldr r6, [sp, #28]
73+
; CHECK-NEXT: subs r7, r0, r4
74+
; CHECK-NEXT: ldr r12, [sp, #32]
75+
; CHECK-NEXT: sbcs r7, r1, r6
76+
; CHECK-NEXT: ldr lr, [sp, #36]
77+
; CHECK-NEXT: sbcs r7, r2, r12
78+
; CHECK-NEXT: sbcs r7, r3, lr
79+
; CHECK-NEXT: mov r7, #0
80+
; CHECK-NEXT: movwlo r7, #1
81+
; CHECK-NEXT: subs r0, r4, r0
82+
; CHECK-NEXT: sbcs r0, r6, r1
83+
; CHECK-NEXT: sbcs r0, r12, r2
84+
; CHECK-NEXT: sbcs r0, lr, r3
85+
; CHECK-NEXT: movwlo r5, #1
86+
; CHECK-NEXT: sub r0, r5, r7
87+
; CHECK-NEXT: pop {r4, r5, r6, r7, r11, pc}
88+
%1 = call i8 @llvm.ucmp(i128 %x, i128 %y)
89+
ret i8 %1
90+
}
91+
92+
define i32 @ucmp_32_32(i32 %x, i32 %y) nounwind {
93+
; CHECK-LABEL: ucmp_32_32:
94+
; CHECK: @ %bb.0:
95+
; CHECK-NEXT: cmp r0, r1
96+
; CHECK-NEXT: mov r0, #0
97+
; CHECK-NEXT: mov r2, #0
98+
; CHECK-NEXT: movwlo r0, #1
99+
; CHECK-NEXT: movwhi r2, #1
100+
; CHECK-NEXT: sub r0, r2, r0
101+
; CHECK-NEXT: bx lr
102+
%1 = call i32 @llvm.ucmp(i32 %x, i32 %y)
103+
ret i32 %1
104+
}
105+
106+
define i32 @ucmp_32_64(i64 %x, i64 %y) nounwind {
107+
; CHECK-LABEL: ucmp_32_64:
108+
; CHECK: @ %bb.0:
109+
; CHECK-NEXT: .save {r11, lr}
110+
; CHECK-NEXT: push {r11, lr}
111+
; CHECK-NEXT: subs lr, r0, r2
112+
; CHECK-NEXT: mov r12, #0
113+
; CHECK-NEXT: sbcs lr, r1, r3
114+
; CHECK-NEXT: mov lr, #0
115+
; CHECK-NEXT: movwlo lr, #1
116+
; CHECK-NEXT: subs r0, r2, r0
117+
; CHECK-NEXT: sbcs r0, r3, r1
118+
; CHECK-NEXT: movwlo r12, #1
119+
; CHECK-NEXT: sub r0, r12, lr
120+
; CHECK-NEXT: pop {r11, pc}
121+
%1 = call i32 @llvm.ucmp(i64 %x, i64 %y)
122+
ret i32 %1
123+
}
124+
125+
define i64 @ucmp_64_64(i64 %x, i64 %y) nounwind {
126+
; CHECK-LABEL: ucmp_64_64:
127+
; CHECK: @ %bb.0:
128+
; CHECK-NEXT: .save {r11, lr}
129+
; CHECK-NEXT: push {r11, lr}
130+
; CHECK-NEXT: subs lr, r0, r2
131+
; CHECK-NEXT: mov r12, #0
132+
; CHECK-NEXT: sbcs lr, r1, r3
133+
; CHECK-NEXT: mov lr, #0
134+
; CHECK-NEXT: movwlo lr, #1
135+
; CHECK-NEXT: subs r0, r2, r0
136+
; CHECK-NEXT: sbcs r0, r3, r1
137+
; CHECK-NEXT: movwlo r12, #1
138+
; CHECK-NEXT: sub r0, r12, lr
139+
; CHECK-NEXT: asr r1, r0, #31
140+
; CHECK-NEXT: pop {r11, pc}
141+
%1 = call i64 @llvm.ucmp(i64 %x, i64 %y)
142+
ret i64 %1
143+
}

0 commit comments

Comments
 (0)