Skip to content

Commit 2ed1598

Browse files
committed
[SDAG] try to reduce compare of funnel shift equal 0
Fold patterns:
  fshl (or X, Y), X, C ==/!= 0 --> or (shl Y, C), X ==/!= 0
  fshl X, (or X, Y), C ==/!= 0 --> or (srl Y, BW-C), X ==/!= 0
This is similar to an existing setcc-of-rotate fold, but the matching requires more checks for the more general funnel op: https://alive2.llvm.org/ce/z/Ab2jDd
We are effectively decomposing the funnel shift into logical shifts, reassociating, and removing a shift.
This should get us the final improvements for x86-64 that were originally shown in D111530 ( #49541 ); x86-32 still shows some SHLD/SHRD, so the pattern is not matching there yet.
Differential Revision: https://reviews.llvm.org/D122919
1 parent d6cf181 commit 2ed1598

File tree

4 files changed

+120
-78
lines changed

4 files changed

+120
-78
lines changed

llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3875,6 +3875,72 @@ static SDValue foldSetCCWithRotate(EVT VT, SDValue N0, SDValue N1,
38753875
return SDValue();
38763876
}
38773877

3878+
/// Try to simplify an equality comparison of a funnel shift against zero by
/// replacing the funnel shift with a single logical shift:
///   fshl (or X, Y), X, C ==/!= 0 --> or (shl Y, C), X ==/!= 0
///   fshl X, (or X, Y), C ==/!= 0 --> or (srl Y, BW-C), X ==/!= 0
/// An FSHR is first rewritten as the equivalent FSHL (shift amount BW-C).
///
/// \param VT   Result type of the setcc being simplified.
/// \param N0   Left-hand setcc operand; must be a one-use FSHL/FSHR with a
///             constant (or constant-splat) shift amount for the fold to fire.
/// \param N1   Right-hand setcc operand; must be a zero constant/splat.
/// \param Cond Condition code; only SETEQ and SETNE are handled.
/// \param dl   Debug location used for the newly created nodes.
/// \param DAG  The DAG in which the replacement nodes are created.
/// \returns the replacement setcc, or a null SDValue if the fold does not
///          apply.
static SDValue foldSetCCWithFunnelShift(EVT VT, SDValue N0, SDValue N1,
                                        ISD::CondCode Cond, const SDLoc &dl,
                                        SelectionDAG &DAG) {
  // If we are testing for all-bits-clear, we might be able to do that with
  // less shifting since bit-order does not matter.
  if (Cond != ISD::SETEQ && Cond != ISD::SETNE)
    return SDValue();

  auto *C1 = isConstOrConstSplat(N1, /* AllowUndefs */ true);
  if (!C1 || !C1->isZero())
    return SDValue();

  if (!N0.hasOneUse() ||
      (N0.getOpcode() != ISD::FSHL && N0.getOpcode() != ISD::FSHR))
    return SDValue();

  unsigned BitWidth = N0.getScalarValueSizeInBits();
  auto *ShAmtC = isConstOrConstSplat(N0.getOperand(2));
  if (!ShAmtC)
    return SDValue();

  // Funnel shift amounts are interpreted modulo the element width. A reduced
  // amount of zero must be rejected: the rewrites below would otherwise emit
  // a logical shift by the full bit width (BW - 0), which is not a valid
  // shift amount and would change the result of the comparison.
  unsigned ShAmt =
      static_cast<unsigned>(ShAmtC->getAPIntValue().urem(BitWidth));
  if (ShAmt == 0)
    return SDValue();

  // Canonicalize fshr as fshl to reduce pattern-matching.
  // ShAmt is in [1, BW-1] here, so BW - ShAmt stays in that range as well.
  if (N0.getOpcode() == ISD::FSHR)
    ShAmt = BitWidth - ShAmt;

  // Match an 'or' with a specific operand 'Other' in either commuted variant.
  // On success, X is the operand equal to 'Other' and Y is the remaining one.
  SDValue X, Y;
  auto matchOr = [&X, &Y](SDValue Or, SDValue Other) {
    if (Or.getOpcode() != ISD::OR || !Or.hasOneUse())
      return false;
    if (Or.getOperand(0) == Other) {
      X = Or.getOperand(0);
      Y = Or.getOperand(1);
      return true;
    }
    if (Or.getOperand(1) == Other) {
      X = Or.getOperand(1);
      Y = Or.getOperand(0);
      return true;
    }
    return false;
  };

  EVT OpVT = N0.getValueType();
  EVT ShAmtVT = N0.getOperand(2).getValueType();
  SDValue F0 = N0.getOperand(0);
  SDValue F1 = N0.getOperand(1);
  if (matchOr(F0, F1)) {
    // fshl (or X, Y), X, C ==/!= 0 --> or (shl Y, C), X ==/!= 0
    SDValue NewShAmt = DAG.getConstant(ShAmt, dl, ShAmtVT);
    SDValue Shift = DAG.getNode(ISD::SHL, dl, OpVT, Y, NewShAmt);
    SDValue NewOr = DAG.getNode(ISD::OR, dl, OpVT, Shift, X);
    return DAG.getSetCC(dl, VT, NewOr, N1, Cond);
  }
  if (matchOr(F1, F0)) {
    // fshl X, (or X, Y), C ==/!= 0 --> or (srl Y, BW-C), X ==/!= 0
    SDValue NewShAmt = DAG.getConstant(BitWidth - ShAmt, dl, ShAmtVT);
    SDValue Shift = DAG.getNode(ISD::SRL, dl, OpVT, Y, NewShAmt);
    SDValue NewOr = DAG.getNode(ISD::OR, dl, OpVT, Shift, X);
    return DAG.getSetCC(dl, VT, NewOr, N1, Cond);
  }

  return SDValue();
}
3943+
38783944
/// Try to simplify a setcc built with the specified operands and cc. If it is
38793945
/// unable to simplify it, return a null SDValue.
38803946
SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
@@ -3914,6 +3980,9 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
39143980
if (SDValue V = foldSetCCWithRotate(VT, N0, N1, Cond, dl, DAG))
39153981
return V;
39163982

3983+
if (SDValue V = foldSetCCWithFunnelShift(VT, N0, N1, Cond, dl, DAG))
3984+
return V;
3985+
39173986
if (auto *N1C = isConstOrConstSplat(N1)) {
39183987
const APInt &C1 = N1C->getAPIntValue();
39193988

llvm/test/CodeGen/AArch64/setcc-fsh.ll

Lines changed: 17 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@ declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
99
define i1 @fshl_or_eq_0(i32 %x, i32 %y) {
1010
; CHECK-LABEL: fshl_or_eq_0:
1111
; CHECK: // %bb.0:
12-
; CHECK-NEXT: ror w8, w0, #27
13-
; CHECK-NEXT: orr w8, w8, w1, lsl #5
12+
; CHECK-NEXT: orr w8, w0, w1, lsl #5
1413
; CHECK-NEXT: cmp w8, #0
1514
; CHECK-NEXT: cset w0, eq
1615
; CHECK-NEXT: ret
@@ -23,8 +22,7 @@ define i1 @fshl_or_eq_0(i32 %x, i32 %y) {
2322
define i1 @fshl_or_commute_eq_0(i32 %x, i32 %y) {
2423
; CHECK-LABEL: fshl_or_commute_eq_0:
2524
; CHECK: // %bb.0:
26-
; CHECK-NEXT: ror w8, w0, #27
27-
; CHECK-NEXT: orr w8, w8, w1, lsl #5
25+
; CHECK-NEXT: orr w8, w0, w1, lsl #5
2826
; CHECK-NEXT: cmp w8, #0
2927
; CHECK-NEXT: cset w0, eq
3028
; CHECK-NEXT: ret
@@ -37,10 +35,8 @@ define i1 @fshl_or_commute_eq_0(i32 %x, i32 %y) {
3735
define <4 x i1> @fshl_or2_eq_0(<4 x i32> %x, <4 x i32> %y) {
3836
; CHECK-LABEL: fshl_or2_eq_0:
3937
; CHECK: // %bb.0:
40-
; CHECK-NEXT: orr v1.16b, v0.16b, v1.16b
41-
; CHECK-NEXT: shl v0.4s, v0.4s, #25
4238
; CHECK-NEXT: ushr v1.4s, v1.4s, #7
43-
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
39+
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
4440
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
4541
; CHECK-NEXT: xtn v0.4h, v0.4s
4642
; CHECK-NEXT: ret
@@ -53,10 +49,8 @@ define <4 x i1> @fshl_or2_eq_0(<4 x i32> %x, <4 x i32> %y) {
5349
define <4 x i1> @fshl_or2_commute_eq_0(<4 x i32> %x, <4 x i32> %y) {
5450
; CHECK-LABEL: fshl_or2_commute_eq_0:
5551
; CHECK: // %bb.0:
56-
; CHECK-NEXT: orr v1.16b, v1.16b, v0.16b
57-
; CHECK-NEXT: shl v0.4s, v0.4s, #25
5852
; CHECK-NEXT: ushr v1.4s, v1.4s, #7
59-
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
53+
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
6054
; CHECK-NEXT: cmeq v0.4s, v0.4s, #0
6155
; CHECK-NEXT: xtn v0.4h, v0.4s
6256
; CHECK-NEXT: ret
@@ -69,9 +63,7 @@ define <4 x i1> @fshl_or2_commute_eq_0(<4 x i32> %x, <4 x i32> %y) {
6963
define i1 @fshr_or_eq_0(i16 %x, i16 %y) {
7064
; CHECK-LABEL: fshr_or_eq_0:
7165
; CHECK: // %bb.0:
72-
; CHECK-NEXT: lsl w8, w0, #16
73-
; CHECK-NEXT: orr w9, w0, w1
74-
; CHECK-NEXT: extr w8, w9, w8, #24
66+
; CHECK-NEXT: orr w8, w0, w1, lsl #8
7567
; CHECK-NEXT: tst w8, #0xffff
7668
; CHECK-NEXT: cset w0, eq
7769
; CHECK-NEXT: ret
@@ -84,9 +76,7 @@ define i1 @fshr_or_eq_0(i16 %x, i16 %y) {
8476
define i1 @fshr_or_commute_eq_0(i16 %x, i16 %y) {
8577
; CHECK-LABEL: fshr_or_commute_eq_0:
8678
; CHECK: // %bb.0:
87-
; CHECK-NEXT: lsl w8, w0, #16
88-
; CHECK-NEXT: orr w9, w1, w0
89-
; CHECK-NEXT: extr w8, w9, w8, #24
79+
; CHECK-NEXT: orr w8, w0, w1, lsl #8
9080
; CHECK-NEXT: tst w8, #0xffff
9181
; CHECK-NEXT: cset w0, eq
9282
; CHECK-NEXT: ret
@@ -99,8 +89,7 @@ define i1 @fshr_or_commute_eq_0(i16 %x, i16 %y) {
9989
define i1 @fshr_or2_eq_0(i64 %x, i64 %y) {
10090
; CHECK-LABEL: fshr_or2_eq_0:
10191
; CHECK: // %bb.0:
102-
; CHECK-NEXT: ror x8, x0, #3
103-
; CHECK-NEXT: orr x8, x8, x1, lsr #3
92+
; CHECK-NEXT: orr x8, x0, x1, lsr #3
10493
; CHECK-NEXT: cmp x8, #0
10594
; CHECK-NEXT: cset w0, eq
10695
; CHECK-NEXT: ret
@@ -113,8 +102,7 @@ define i1 @fshr_or2_eq_0(i64 %x, i64 %y) {
113102
define i1 @fshl_or_ne_0(i32 %x, i32 %y) {
114103
; CHECK-LABEL: fshl_or_ne_0:
115104
; CHECK: // %bb.0:
116-
; CHECK-NEXT: ror w8, w0, #25
117-
; CHECK-NEXT: orr w8, w8, w1, lsl #7
105+
; CHECK-NEXT: orr w8, w0, w1, lsl #7
118106
; CHECK-NEXT: cmp w8, #0
119107
; CHECK-NEXT: cset w0, ne
120108
; CHECK-NEXT: ret
@@ -127,8 +115,7 @@ define i1 @fshl_or_ne_0(i32 %x, i32 %y) {
127115
define i1 @fshl_or_commute_ne_0(i32 %x, i32 %y) {
128116
; CHECK-LABEL: fshl_or_commute_ne_0:
129117
; CHECK: // %bb.0:
130-
; CHECK-NEXT: ror w8, w0, #25
131-
; CHECK-NEXT: orr w8, w8, w1, lsl #7
118+
; CHECK-NEXT: orr w8, w0, w1, lsl #7
132119
; CHECK-NEXT: cmp w8, #0
133120
; CHECK-NEXT: cset w0, ne
134121
; CHECK-NEXT: ret
@@ -141,10 +128,8 @@ define i1 @fshl_or_commute_ne_0(i32 %x, i32 %y) {
141128
define <4 x i1> @fshl_or2_ne_0(<4 x i32> %x, <4 x i32> %y) {
142129
; CHECK-LABEL: fshl_or2_ne_0:
143130
; CHECK: // %bb.0:
144-
; CHECK-NEXT: orr v1.16b, v0.16b, v1.16b
145-
; CHECK-NEXT: shl v0.4s, v0.4s, #5
146131
; CHECK-NEXT: ushr v1.4s, v1.4s, #27
147-
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
132+
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
148133
; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s
149134
; CHECK-NEXT: xtn v0.4h, v0.4s
150135
; CHECK-NEXT: ret
@@ -157,10 +142,8 @@ define <4 x i1> @fshl_or2_ne_0(<4 x i32> %x, <4 x i32> %y) {
157142
define <4 x i1> @fshl_or2_commute_ne_0(<4 x i32> %x, <4 x i32> %y) {
158143
; CHECK-LABEL: fshl_or2_commute_ne_0:
159144
; CHECK: // %bb.0:
160-
; CHECK-NEXT: orr v1.16b, v1.16b, v0.16b
161-
; CHECK-NEXT: shl v0.4s, v0.4s, #5
162145
; CHECK-NEXT: ushr v1.4s, v1.4s, #27
163-
; CHECK-NEXT: orr v0.16b, v0.16b, v1.16b
146+
; CHECK-NEXT: orr v0.16b, v1.16b, v0.16b
164147
; CHECK-NEXT: cmtst v0.4s, v0.4s, v0.4s
165148
; CHECK-NEXT: xtn v0.4h, v0.4s
166149
; CHECK-NEXT: ret
@@ -173,8 +156,7 @@ define <4 x i1> @fshl_or2_commute_ne_0(<4 x i32> %x, <4 x i32> %y) {
173156
define i1 @fshr_or_ne_0(i64 %x, i64 %y) {
174157
; CHECK-LABEL: fshr_or_ne_0:
175158
; CHECK: // %bb.0:
176-
; CHECK-NEXT: orr w8, w0, w1
177-
; CHECK-NEXT: extr x8, x8, x0, #1
159+
; CHECK-NEXT: orr x8, x0, x1, lsl #63
178160
; CHECK-NEXT: cmp x8, #0
179161
; CHECK-NEXT: cset w0, ne
180162
; CHECK-NEXT: ret
@@ -187,8 +169,7 @@ define i1 @fshr_or_ne_0(i64 %x, i64 %y) {
187169
define i1 @fshr_or_commute_ne_0(i64 %x, i64 %y) {
188170
; CHECK-LABEL: fshr_or_commute_ne_0:
189171
; CHECK: // %bb.0:
190-
; CHECK-NEXT: orr w8, w1, w0
191-
; CHECK-NEXT: extr x8, x8, x0, #1
172+
; CHECK-NEXT: orr x8, x0, x1, lsl #63
192173
; CHECK-NEXT: cmp x8, #0
193174
; CHECK-NEXT: cset w0, ne
194175
; CHECK-NEXT: ret
@@ -201,9 +182,8 @@ define i1 @fshr_or_commute_ne_0(i64 %x, i64 %y) {
201182
define i1 @fshr_or2_ne_0(i16 %x, i16 %y) {
202183
; CHECK-LABEL: fshr_or2_ne_0:
203184
; CHECK: // %bb.0:
204-
; CHECK-NEXT: orr w8, w0, w1
205-
; CHECK-NEXT: lsl w8, w8, #16
206-
; CHECK-NEXT: extr w8, w0, w8, #18
185+
; CHECK-NEXT: and w8, w1, #0xfffc
186+
; CHECK-NEXT: orr w8, w0, w8, lsr #2
207187
; CHECK-NEXT: tst w8, #0xffff
208188
; CHECK-NEXT: cset w0, ne
209189
; CHECK-NEXT: ret
@@ -216,9 +196,8 @@ define i1 @fshr_or2_ne_0(i16 %x, i16 %y) {
216196
define i1 @fshr_or2_commute_ne_0(i16 %x, i16 %y) {
217197
; CHECK-LABEL: fshr_or2_commute_ne_0:
218198
; CHECK: // %bb.0:
219-
; CHECK-NEXT: orr w8, w1, w0
220-
; CHECK-NEXT: lsl w8, w8, #16
221-
; CHECK-NEXT: extr w8, w0, w8, #18
199+
; CHECK-NEXT: and w8, w1, #0xfffc
200+
; CHECK-NEXT: orr w8, w0, w8, lsr #2
222201
; CHECK-NEXT: tst w8, #0xffff
223202
; CHECK-NEXT: cset w0, ne
224203
; CHECK-NEXT: ret

llvm/test/CodeGen/X86/icmp-shift-opt.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,8 @@ define i128 @opt_setcc_lt_power_of_2(i128 %a) nounwind {
5353
; X64-NEXT: addq $1, %rax
5454
; X64-NEXT: adcq $0, %rdx
5555
; X64-NEXT: movq %rax, %rcx
56+
; X64-NEXT: shrq $60, %rcx
5657
; X64-NEXT: orq %rdx, %rcx
57-
; X64-NEXT: shrdq $60, %rdx, %rcx
5858
; X64-NEXT: jne .LBB0_1
5959
; X64-NEXT: # %bb.2: # %exit
6060
; X64-NEXT: retq
@@ -90,8 +90,8 @@ define i1 @opt_setcc_srl_eq_zero(i128 %a) nounwind {
9090
;
9191
; X64-LABEL: opt_setcc_srl_eq_zero:
9292
; X64: # %bb.0:
93+
; X64-NEXT: shrq $17, %rdi
9394
; X64-NEXT: orq %rsi, %rdi
94-
; X64-NEXT: shrdq $17, %rsi, %rdi
9595
; X64-NEXT: sete %al
9696
; X64-NEXT: retq
9797
%srl = lshr i128 %a, 17
@@ -119,8 +119,8 @@ define i1 @opt_setcc_srl_ne_zero(i128 %a) nounwind {
119119
;
120120
; X64-LABEL: opt_setcc_srl_ne_zero:
121121
; X64: # %bb.0:
122+
; X64-NEXT: shrq $17, %rdi
122123
; X64-NEXT: orq %rsi, %rdi
123-
; X64-NEXT: shrdq $17, %rsi, %rdi
124124
; X64-NEXT: setne %al
125125
; X64-NEXT: retq
126126
%srl = lshr i128 %a, 17
@@ -148,8 +148,8 @@ define i1 @opt_setcc_shl_eq_zero(i128 %a) nounwind {
148148
;
149149
; X64-LABEL: opt_setcc_shl_eq_zero:
150150
; X64: # %bb.0:
151+
; X64-NEXT: shlq $17, %rsi
151152
; X64-NEXT: orq %rdi, %rsi
152-
; X64-NEXT: shldq $17, %rdi, %rsi
153153
; X64-NEXT: sete %al
154154
; X64-NEXT: retq
155155
%shl = shl i128 %a, 17
@@ -177,8 +177,8 @@ define i1 @opt_setcc_shl_ne_zero(i128 %a) nounwind {
177177
;
178178
; X64-LABEL: opt_setcc_shl_ne_zero:
179179
; X64: # %bb.0:
180+
; X64-NEXT: shlq $17, %rsi
180181
; X64-NEXT: orq %rdi, %rsi
181-
; X64-NEXT: shldq $17, %rdi, %rsi
182182
; X64-NEXT: setne %al
183183
; X64-NEXT: retq
184184
%shl = shl i128 %a, 17
@@ -255,8 +255,8 @@ define i1 @opt_setcc_expanded_shl_correct_shifts(i64 %a, i64 %b) nounwind {
255255
;
256256
; X64-LABEL: opt_setcc_expanded_shl_correct_shifts:
257257
; X64: # %bb.0:
258+
; X64-NEXT: shlq $17, %rdi
258259
; X64-NEXT: orq %rsi, %rdi
259-
; X64-NEXT: shldq $17, %rsi, %rdi
260260
; X64-NEXT: sete %al
261261
; X64-NEXT: retq
262262
%shl.a = shl i64 %a, 17

0 commit comments

Comments
 (0)