Skip to content

Commit 3250317

Browse files
committed
[SelectionDAG] Optimize expansion for rotates/funnel shifts
If the type of a funnel shift needs to be expanded, expand it to two funnel shifts instead of regular shifts. For constant shift amounts this makes little difference, but for variable shift amounts it allows a more efficient lowering.

Also use the optimized funnel shift lowering for rotates.

Alive2: https://alive2.llvm.org/ce/z/TvHDB- / https://alive2.llvm.org/ce/z/yzPept

(Branched from D108058, as getting this completed should help unlock some other WIP patches.)

Original Patch: @efriedma (Eli Friedman)

Differential Revision: https://reviews.llvm.org/D112443
1 parent 37e17f2 commit 3250317

File tree

15 files changed

+1276
-3085
lines changed

15 files changed

+1276
-3085
lines changed

llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4377,18 +4377,45 @@ void DAGTypeLegalizer::ExpandIntRes_VECREDUCE(SDNode *N,
43774377

43784378
void DAGTypeLegalizer::ExpandIntRes_Rotate(SDNode *N,
43794379
SDValue &Lo, SDValue &Hi) {
4380-
// Lower the rotate to shifts and ORs which can be expanded.
4381-
SDValue Res;
4382-
TLI.expandROT(N, true /*AllowVectorOps*/, Res, DAG);
4380+
// Delegate to funnel-shift expansion.
4381+
SDLoc DL(N);
4382+
unsigned Opcode = N->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
4383+
SDValue Res = DAG.getNode(Opcode, DL, N->getValueType(0), N->getOperand(0),
4384+
N->getOperand(0), N->getOperand(1));
43834385
SplitInteger(Res, Lo, Hi);
43844386
}
43854387

4386-
void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N,
4387-
SDValue &Lo, SDValue &Hi) {
4388-
// Lower the funnel shift to shifts and ORs which can be expanded.
4389-
SDValue Res;
4390-
TLI.expandFunnelShift(N, Res, DAG);
4391-
SplitInteger(Res, Lo, Hi);
4388+
void DAGTypeLegalizer::ExpandIntRes_FunnelShift(SDNode *N, SDValue &Lo,
4389+
SDValue &Hi) {
4390+
// Values numbered from least significant to most significant.
4391+
SDValue In1, In2, In3, In4;
4392+
GetExpandedInteger(N->getOperand(0), In3, In4);
4393+
GetExpandedInteger(N->getOperand(1), In1, In2);
4394+
EVT HalfVT = In1.getValueType();
4395+
4396+
SDLoc DL(N);
4397+
unsigned Opc = N->getOpcode();
4398+
SDValue ShAmt = N->getOperand(2);
4399+
EVT ShAmtVT = ShAmt.getValueType();
4400+
EVT ShAmtCCVT = getSetCCResultType(ShAmtVT);
4401+
4402+
// If the shift amount is at least half the bitwidth, swap the inputs.
4403+
unsigned HalfVTBits = HalfVT.getScalarSizeInBits();
4404+
SDValue AndNode = DAG.getNode(ISD::AND, DL, ShAmtVT, ShAmt,
4405+
DAG.getConstant(HalfVTBits, DL, ShAmtVT));
4406+
SDValue Cond =
4407+
DAG.getSetCC(DL, ShAmtCCVT, AndNode, DAG.getConstant(0, DL, ShAmtVT),
4408+
Opc == ISD::FSHL ? ISD::SETNE : ISD::SETEQ);
4409+
4410+
// Expand to a pair of funnel shifts.
4411+
EVT NewShAmtVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
4412+
SDValue NewShAmt = DAG.getAnyExtOrTrunc(ShAmt, DL, NewShAmtVT);
4413+
4414+
SDValue Select1 = DAG.getNode(ISD::SELECT, DL, HalfVT, Cond, In1, In2);
4415+
SDValue Select2 = DAG.getNode(ISD::SELECT, DL, HalfVT, Cond, In2, In3);
4416+
SDValue Select3 = DAG.getNode(ISD::SELECT, DL, HalfVT, Cond, In3, In4);
4417+
Lo = DAG.getNode(Opc, DL, HalfVT, Select2, Select1, NewShAmt);
4418+
Hi = DAG.getNode(Opc, DL, HalfVT, Select3, Select2, NewShAmt);
43924419
}
43934420

43944421
void DAGTypeLegalizer::ExpandIntRes_VSCALE(SDNode *N, SDValue &Lo,

llvm/test/CodeGen/AArch64/funnel-shift.ll

Lines changed: 10 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -46,29 +46,19 @@ define i64 @fshl_i64(i64 %x, i64 %y, i64 %z) {
4646
define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
4747
; CHECK-LABEL: fshl_i128:
4848
; CHECK: // %bb.0:
49+
; CHECK-NEXT: tst x4, #0x40
4950
; CHECK-NEXT: mvn w8, w4
50-
; CHECK-NEXT: extr x9, x3, x2, #1
51-
; CHECK-NEXT: lsr x10, x3, #1
52-
; CHECK-NEXT: and x12, x8, #0x7f
53-
; CHECK-NEXT: lsl x11, x10, #1
54-
; CHECK-NEXT: tst x12, #0x40
55-
; CHECK-NEXT: lsl x11, x11, x4
51+
; CHECK-NEXT: csel x9, x2, x3, ne
52+
; CHECK-NEXT: csel x10, x3, x0, ne
53+
; CHECK-NEXT: lsr x9, x9, #1
54+
; CHECK-NEXT: lsl x11, x10, x4
55+
; CHECK-NEXT: csel x12, x0, x1, ne
56+
; CHECK-NEXT: lsr x10, x10, #1
5657
; CHECK-NEXT: lsr x9, x9, x8
57-
; CHECK-NEXT: orr x9, x11, x9
58-
; CHECK-NEXT: lsr x11, x0, #1
59-
; CHECK-NEXT: lsr x10, x10, x8
60-
; CHECK-NEXT: lsl x12, x1, x4
61-
; CHECK-NEXT: lsr x8, x11, x8
62-
; CHECK-NEXT: and x11, x4, #0x7f
63-
; CHECK-NEXT: csel x9, x10, x9, ne
64-
; CHECK-NEXT: csel x10, xzr, x10, ne
65-
; CHECK-NEXT: orr x8, x12, x8
66-
; CHECK-NEXT: lsl x12, x0, x4
67-
; CHECK-NEXT: tst x11, #0x40
68-
; CHECK-NEXT: csel x8, x12, x8, ne
69-
; CHECK-NEXT: csel x11, xzr, x12, ne
70-
; CHECK-NEXT: orr x1, x8, x10
58+
; CHECK-NEXT: lsl x12, x12, x4
59+
; CHECK-NEXT: lsr x8, x10, x8
7160
; CHECK-NEXT: orr x0, x11, x9
61+
; CHECK-NEXT: orr x1, x12, x8
7262
; CHECK-NEXT: ret
7363
%f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z)
7464
ret i128 %f

llvm/test/CodeGen/ARM/funnel-shift-rot.ll

Lines changed: 33 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -67,61 +67,24 @@ define i32 @rotl_i32(i32 %x, i32 %z) {
6767
}
6868

6969
define i64 @rotl_i64(i64 %x, i64 %z) {
70-
; SCALAR-LABEL: rotl_i64:
71-
; SCALAR: @ %bb.0:
72-
; SCALAR-NEXT: .save {r4, r5, r11, lr}
73-
; SCALAR-NEXT: push {r4, r5, r11, lr}
74-
; SCALAR-NEXT: rsb r3, r2, #0
75-
; SCALAR-NEXT: and r4, r2, #63
76-
; SCALAR-NEXT: and lr, r3, #63
77-
; SCALAR-NEXT: rsb r3, lr, #32
78-
; SCALAR-NEXT: lsl r2, r0, r4
79-
; SCALAR-NEXT: lsr r12, r0, lr
80-
; SCALAR-NEXT: orr r3, r12, r1, lsl r3
81-
; SCALAR-NEXT: subs r12, lr, #32
82-
; SCALAR-NEXT: lsrpl r3, r1, r12
83-
; SCALAR-NEXT: subs r5, r4, #32
84-
; SCALAR-NEXT: movwpl r2, #0
85-
; SCALAR-NEXT: cmp r5, #0
86-
; SCALAR-NEXT: orr r2, r2, r3
87-
; SCALAR-NEXT: rsb r3, r4, #32
88-
; SCALAR-NEXT: lsr r3, r0, r3
89-
; SCALAR-NEXT: orr r3, r3, r1, lsl r4
90-
; SCALAR-NEXT: lslpl r3, r0, r5
91-
; SCALAR-NEXT: lsr r0, r1, lr
92-
; SCALAR-NEXT: cmp r12, #0
93-
; SCALAR-NEXT: movwpl r0, #0
94-
; SCALAR-NEXT: orr r1, r3, r0
95-
; SCALAR-NEXT: mov r0, r2
96-
; SCALAR-NEXT: pop {r4, r5, r11, pc}
97-
;
98-
; NEON-LABEL: rotl_i64:
99-
; NEON: @ %bb.0:
100-
; NEON-NEXT: .save {r4, r5, r11, lr}
101-
; NEON-NEXT: push {r4, r5, r11, lr}
102-
; NEON-NEXT: and r12, r2, #63
103-
; NEON-NEXT: rsb r2, r2, #0
104-
; NEON-NEXT: rsb r3, r12, #32
105-
; NEON-NEXT: and r4, r2, #63
106-
; NEON-NEXT: subs lr, r12, #32
107-
; NEON-NEXT: lsr r3, r0, r3
108-
; NEON-NEXT: lsr r2, r1, r4
109-
; NEON-NEXT: orr r3, r3, r1, lsl r12
110-
; NEON-NEXT: lslpl r3, r0, lr
111-
; NEON-NEXT: subs r5, r4, #32
112-
; NEON-NEXT: movwpl r2, #0
113-
; NEON-NEXT: cmp r5, #0
114-
; NEON-NEXT: orr r2, r3, r2
115-
; NEON-NEXT: lsr r3, r0, r4
116-
; NEON-NEXT: rsb r4, r4, #32
117-
; NEON-NEXT: lsl r0, r0, r12
118-
; NEON-NEXT: orr r3, r3, r1, lsl r4
119-
; NEON-NEXT: lsrpl r3, r1, r5
120-
; NEON-NEXT: cmp lr, #0
121-
; NEON-NEXT: movwpl r0, #0
122-
; NEON-NEXT: mov r1, r2
123-
; NEON-NEXT: orr r0, r0, r3
124-
; NEON-NEXT: pop {r4, r5, r11, pc}
70+
; CHECK-LABEL: rotl_i64:
71+
; CHECK: @ %bb.0:
72+
; CHECK-NEXT: .save {r4, lr}
73+
; CHECK-NEXT: push {r4, lr}
74+
; CHECK-NEXT: ands r3, r2, #32
75+
; CHECK-NEXT: and r12, r2, #31
76+
; CHECK-NEXT: mov r3, r0
77+
; CHECK-NEXT: mov r4, #31
78+
; CHECK-NEXT: movne r3, r1
79+
; CHECK-NEXT: movne r1, r0
80+
; CHECK-NEXT: bic r2, r4, r2
81+
; CHECK-NEXT: lsl lr, r3, r12
82+
; CHECK-NEXT: lsr r0, r1, #1
83+
; CHECK-NEXT: lsl r1, r1, r12
84+
; CHECK-NEXT: lsr r3, r3, #1
85+
; CHECK-NEXT: orr r0, lr, r0, lsr r2
86+
; CHECK-NEXT: orr r1, r1, r3, lsr r2
87+
; CHECK-NEXT: pop {r4, pc}
12588
%f = call i64 @llvm.fshl.i64(i64 %x, i64 %x, i64 %z)
12689
ret i64 %f
12790
}
@@ -243,31 +206,21 @@ define i32 @rotr_i32(i32 %x, i32 %z) {
243206
define i64 @rotr_i64(i64 %x, i64 %z) {
244207
; CHECK-LABEL: rotr_i64:
245208
; CHECK: @ %bb.0:
246-
; CHECK-NEXT: .save {r4, r5, r11, lr}
247-
; CHECK-NEXT: push {r4, r5, r11, lr}
248-
; CHECK-NEXT: and lr, r2, #63
249-
; CHECK-NEXT: rsb r2, r2, #0
250-
; CHECK-NEXT: rsb r3, lr, #32
251-
; CHECK-NEXT: and r4, r2, #63
252-
; CHECK-NEXT: lsr r12, r0, lr
253-
; CHECK-NEXT: orr r3, r12, r1, lsl r3
254-
; CHECK-NEXT: subs r12, lr, #32
255-
; CHECK-NEXT: lsl r2, r0, r4
256-
; CHECK-NEXT: lsrpl r3, r1, r12
257-
; CHECK-NEXT: subs r5, r4, #32
258-
; CHECK-NEXT: movwpl r2, #0
259-
; CHECK-NEXT: cmp r5, #0
260-
; CHECK-NEXT: orr r2, r3, r2
261-
; CHECK-NEXT: rsb r3, r4, #32
262-
; CHECK-NEXT: lsr r3, r0, r3
263-
; CHECK-NEXT: orr r3, r3, r1, lsl r4
264-
; CHECK-NEXT: lslpl r3, r0, r5
265-
; CHECK-NEXT: lsr r0, r1, lr
266-
; CHECK-NEXT: cmp r12, #0
267-
; CHECK-NEXT: movwpl r0, #0
268-
; CHECK-NEXT: orr r1, r0, r3
269-
; CHECK-NEXT: mov r0, r2
270-
; CHECK-NEXT: pop {r4, r5, r11, pc}
209+
; CHECK-NEXT: ands r3, r2, #32
210+
; CHECK-NEXT: mov r3, r1
211+
; CHECK-NEXT: moveq r3, r0
212+
; CHECK-NEXT: moveq r0, r1
213+
; CHECK-NEXT: mov r1, #31
214+
; CHECK-NEXT: lsl r12, r0, #1
215+
; CHECK-NEXT: bic r1, r1, r2
216+
; CHECK-NEXT: and r2, r2, #31
217+
; CHECK-NEXT: lsl r12, r12, r1
218+
; CHECK-NEXT: orr r12, r12, r3, lsr r2
219+
; CHECK-NEXT: lsl r3, r3, #1
220+
; CHECK-NEXT: lsl r1, r3, r1
221+
; CHECK-NEXT: orr r1, r1, r0, lsr r2
222+
; CHECK-NEXT: mov r0, r12
223+
; CHECK-NEXT: bx lr
271224
%f = call i64 @llvm.fshr.i64(i64 %x, i64 %x, i64 %z)
272225
ret i64 %f
273226
}

0 commit comments

Comments
 (0)