Skip to content

Commit cf10fec

Browse files
committed
[DAGCombiner] Add support for scalarising extracts of a vector setcc
For IR like this:

  %icmp = icmp ult <4 x i32> %a, splat (i32 5)
  %res = extractelement <4 x i1> %icmp, i32 1

where there is only one use of %icmp, we can take a similar approach to what we already do for binary ops such as add, sub, etc. and convert this into:

  %ext = extractelement <4 x i32> %a, i32 1
  %res = icmp ult i32 %ext, 5

For AArch64 targets at least, the scalar boolean result will almost certainly need to be in a GPR anyway, since it will probably be used by branches for control flow. I've tried to reuse existing code in scalarizeExtractedBinop to also work for setcc.

NOTE: The optimisations don't apply for tests such as extract_icmp_v4i32_splat_rhs in the file CodeGen/AArch64/extract-vector-cmp.ll because scalarizeExtractedBinOp only works if one of the input operands is a constant.
1 parent 36dce93 commit cf10fec

File tree

5 files changed

+93
-124
lines changed

5 files changed

+93
-124
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3342,6 +3342,10 @@ class TargetLoweringBase {
33423342
return false;
33433343
}
33443344

3345+
/// Try to convert an extract element of a vector setcc operation into an
3346+
/// extract element followed by a scalar operation.
3347+
virtual bool shouldScalarizeSetCC(SDValue VecOp) const { return false; }
3348+
33453349
/// Return true if extraction of a scalar element from the given vector type
33463350
/// at the given index is cheap. For example, if scalar operations occur on
33473351
/// the same register file as vector operations, then an extract element may

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 50 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22743,19 +22743,15 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
2274322743

2274422744
/// Transform a vector binary operation into a scalar binary operation by moving
2274522745
/// the math/logic after an extract element of a vector.
22746-
static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
22747-
const SDLoc &DL, bool LegalOperations) {
22748-
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22746+
static bool scalarizeExtractedBinOpCommon(SDNode *ExtElt, SelectionDAG &DAG,
22747+
const SDLoc &DL, bool IsSetCC,
22748+
SDValue &ScalarOp1,
22749+
SDValue &ScalarOp2) {
2274922750
SDValue Vec = ExtElt->getOperand(0);
2275022751
SDValue Index = ExtElt->getOperand(1);
2275122752
auto *IndexC = dyn_cast<ConstantSDNode>(Index);
22752-
if (!IndexC || !TLI.isBinOp(Vec.getOpcode()) || !Vec.hasOneUse() ||
22753-
Vec->getNumValues() != 1)
22754-
return SDValue();
22755-
22756-
// Targets may want to avoid this to prevent an expensive register transfer.
22757-
if (!TLI.shouldScalarizeBinop(Vec))
22758-
return SDValue();
22753+
if (!IndexC || !Vec.hasOneUse() || Vec->getNumValues() != 1)
22754+
return false;
2275922755

2276022756
// Extracting an element of a vector constant is constant-folded, so this
2276122757
// transform is just replacing a vector op with a scalar op while moving the
@@ -22769,13 +22765,46 @@ static SDValue scalarizeExtractedBinop(SDNode *ExtElt, SelectionDAG &DAG,
2276922765
ISD::isConstantSplatVector(Op1.getNode(), SplatVal)) {
2277022766
// extractelt (binop X, C), IndexC --> binop (extractelt X, IndexC), C'
2277122767
// extractelt (binop C, X), IndexC --> binop C', (extractelt X, IndexC)
22772-
EVT VT = ExtElt->getValueType(0);
22773-
SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
22774-
SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
22775-
return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1);
22768+
// extractelt (setcc X, C, op), IndexC -> setcc (extractelt X, IndexC), C', op
22769+
// extractelt (setcc C, X, op), IndexC -> setcc C', (extractelt X, IndexC), op
22770+
EVT VT = Op0->getValueType(0).getVectorElementType();
22771+
ScalarOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op0, Index);
22772+
ScalarOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op1, Index);
22773+
return true;
2277622774
}
2277722775

22778-
return SDValue();
22776+
return false;
22777+
}
22778+
22779+
static SDValue scalarizeExtractedBinOp(SDNode *ExtElt, SelectionDAG &DAG,
22780+
const SDLoc &DL) {
22781+
SDValue Op1, Op2;
22782+
SDValue Vec = ExtElt->getOperand(0);
22783+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22784+
if (!TLI.isBinOp(Vec.getOpcode()) || !TLI.shouldScalarizeBinop(Vec))
22785+
return SDValue();
22786+
22787+
if (!scalarizeExtractedBinOpCommon(ExtElt, DAG, DL, false, Op1, Op2))
22788+
return SDValue();
22789+
22790+
EVT VT = ExtElt->getValueType(0);
22791+
return DAG.getNode(Vec.getOpcode(), DL, VT, Op1, Op2);
22792+
}
22793+
22794+
static SDValue scalarizeExtractedSetCC(SDNode *ExtElt, SelectionDAG &DAG,
22795+
const SDLoc &DL) {
22796+
SDValue Op1, Op2;
22797+
SDValue Vec = ExtElt->getOperand(0);
22798+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22799+
if (Vec.getOpcode() != ISD::SETCC || !TLI.shouldScalarizeSetCC(Vec))
22800+
return SDValue();
22801+
22802+
if (!scalarizeExtractedBinOpCommon(ExtElt, DAG, DL, true, Op1, Op2))
22803+
return SDValue();
22804+
22805+
EVT VT = ExtElt->getValueType(0);
22806+
return DAG.getSetCC(DL, VT, Op1, Op2,
22807+
cast<CondCodeSDNode>(Vec->getOperand(2))->get());
2277922808
}
2278022809

2278122810
// Given a ISD::EXTRACT_VECTOR_ELT, which is a glorified bit sequence extract,
@@ -23008,9 +23037,14 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
2300823037
}
2300923038
}
2301023039

23011-
if (SDValue BO = scalarizeExtractedBinop(N, DAG, DL, LegalOperations))
23040+
if (SDValue BO = scalarizeExtractedBinOp(N, DAG, DL))
2301223041
return BO;
2301323042

23043+
// extract (setcc x, splat(y)), i -> setcc (extract x, i), y
23044+
if (ScalarVT == VecVT.getVectorElementType())
23045+
if (SDValue SetCC = scalarizeExtractedSetCC(N, DAG, DL))
23046+
return SetCC;
23047+
2301423048
if (VecVT.isScalableVector())
2301523049
return SDValue();
2301623050

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1348,6 +1348,8 @@ class AArch64TargetLowering : public TargetLowering {
13481348
unsigned getMinimumJumpTableEntries() const override;
13491349

13501350
bool softPromoteHalfType() const override { return true; }
1351+
1352+
bool shouldScalarizeSetCC(SDValue VecOp) const override { return true; }
13511353
};
13521354

13531355
namespace AArch64 {

llvm/test/CodeGen/AArch64/dag-combine-concat-vectors.ll

Lines changed: 0 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -8,56 +8,7 @@ declare void @llvm.masked.scatter.nxv16i8.nxv16p0(<vscale x 16 x i8>, <vscale x
88
define fastcc i8 @allocno_reload_assign() {
99
; CHECK-LABEL: allocno_reload_assign:
1010
; CHECK: // %bb.0:
11-
; CHECK-NEXT: fmov d0, xzr
12-
; CHECK-NEXT: ptrue p0.d
13-
; CHECK-NEXT: mov z16.d, #0 // =0x0
14-
; CHECK-NEXT: cmpeq p0.d, p0/z, z0.d, #0
15-
; CHECK-NEXT: uzp1 p0.s, p0.s, p0.s
16-
; CHECK-NEXT: uzp1 p0.h, p0.h, p0.h
17-
; CHECK-NEXT: uzp1 p0.b, p0.b, p0.b
18-
; CHECK-NEXT: mov z0.b, p0/z, #1 // =0x1
19-
; CHECK-NEXT: fmov w8, s0
20-
; CHECK-NEXT: mov z0.b, #0 // =0x0
21-
; CHECK-NEXT: uunpklo z1.h, z0.b
22-
; CHECK-NEXT: uunpkhi z0.h, z0.b
23-
; CHECK-NEXT: mvn w8, w8
24-
; CHECK-NEXT: sbfx x8, x8, #0, #1
25-
; CHECK-NEXT: whilelo p0.b, xzr, x8
26-
; CHECK-NEXT: uunpklo z2.s, z1.h
27-
; CHECK-NEXT: uunpkhi z3.s, z1.h
28-
; CHECK-NEXT: uunpklo z5.s, z0.h
29-
; CHECK-NEXT: uunpkhi z7.s, z0.h
30-
; CHECK-NEXT: punpklo p1.h, p0.b
31-
; CHECK-NEXT: punpkhi p0.h, p0.b
32-
; CHECK-NEXT: punpklo p2.h, p1.b
33-
; CHECK-NEXT: punpkhi p3.h, p1.b
34-
; CHECK-NEXT: uunpklo z0.d, z2.s
35-
; CHECK-NEXT: uunpkhi z1.d, z2.s
36-
; CHECK-NEXT: punpklo p5.h, p0.b
37-
; CHECK-NEXT: uunpklo z2.d, z3.s
38-
; CHECK-NEXT: uunpkhi z3.d, z3.s
39-
; CHECK-NEXT: punpkhi p7.h, p0.b
40-
; CHECK-NEXT: uunpklo z4.d, z5.s
41-
; CHECK-NEXT: uunpkhi z5.d, z5.s
42-
; CHECK-NEXT: uunpklo z6.d, z7.s
43-
; CHECK-NEXT: uunpkhi z7.d, z7.s
44-
; CHECK-NEXT: punpklo p0.h, p2.b
45-
; CHECK-NEXT: punpkhi p1.h, p2.b
46-
; CHECK-NEXT: punpklo p2.h, p3.b
47-
; CHECK-NEXT: punpkhi p3.h, p3.b
48-
; CHECK-NEXT: punpklo p4.h, p5.b
49-
; CHECK-NEXT: punpkhi p5.h, p5.b
50-
; CHECK-NEXT: punpklo p6.h, p7.b
51-
; CHECK-NEXT: punpkhi p7.h, p7.b
5211
; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1
53-
; CHECK-NEXT: st1b { z0.d }, p0, [z16.d]
54-
; CHECK-NEXT: st1b { z1.d }, p1, [z16.d]
55-
; CHECK-NEXT: st1b { z2.d }, p2, [z16.d]
56-
; CHECK-NEXT: st1b { z3.d }, p3, [z16.d]
57-
; CHECK-NEXT: st1b { z4.d }, p4, [z16.d]
58-
; CHECK-NEXT: st1b { z5.d }, p5, [z16.d]
59-
; CHECK-NEXT: st1b { z6.d }, p6, [z16.d]
60-
; CHECK-NEXT: st1b { z7.d }, p7, [z16.d]
6112
; CHECK-NEXT: b .LBB0_1
6213
br label %1
6314

llvm/test/CodeGen/AArch64/extract-vector-cmp.ll

Lines changed: 37 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,9 @@ target triple = "aarch64-unknown-linux-gnu"
77
define i1 @extract_icmp_v4i32_const_splat_rhs(<4 x i32> %a) {
88
; CHECK-LABEL: extract_icmp_v4i32_const_splat_rhs:
99
; CHECK: // %bb.0:
10-
; CHECK-NEXT: movi v1.4s, #5
11-
; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
12-
; CHECK-NEXT: xtn v0.4h, v0.4s
13-
; CHECK-NEXT: umov w8, v0.h[1]
14-
; CHECK-NEXT: and w0, w8, #0x1
10+
; CHECK-NEXT: mov w8, v0.s[1]
11+
; CHECK-NEXT: cmp w8, #5
12+
; CHECK-NEXT: cset w0, lo
1513
; CHECK-NEXT: ret
1614
%icmp = icmp ult <4 x i32> %a, splat (i32 5)
1715
%ext = extractelement <4 x i1> %icmp, i32 1
@@ -21,11 +19,9 @@ define i1 @extract_icmp_v4i32_const_splat_rhs(<4 x i32> %a) {
2119
define i1 @extract_icmp_v4i32_const_splat_lhs(<4 x i32> %a) {
2220
; CHECK-LABEL: extract_icmp_v4i32_const_splat_lhs:
2321
; CHECK: // %bb.0:
24-
; CHECK-NEXT: movi v1.4s, #7
25-
; CHECK-NEXT: cmhi v0.4s, v0.4s, v1.4s
26-
; CHECK-NEXT: xtn v0.4h, v0.4s
27-
; CHECK-NEXT: umov w8, v0.h[1]
28-
; CHECK-NEXT: and w0, w8, #0x1
22+
; CHECK-NEXT: mov w8, v0.s[1]
23+
; CHECK-NEXT: cmp w8, #7
24+
; CHECK-NEXT: cset w0, hi
2925
; CHECK-NEXT: ret
3026
%icmp = icmp ult <4 x i32> splat(i32 7), %a
3127
%ext = extractelement <4 x i1> %icmp, i32 1
@@ -35,12 +31,9 @@ define i1 @extract_icmp_v4i32_const_splat_lhs(<4 x i32> %a) {
3531
define i1 @extract_icmp_v4i32_const_vec_rhs(<4 x i32> %a) {
3632
; CHECK-LABEL: extract_icmp_v4i32_const_vec_rhs:
3733
; CHECK: // %bb.0:
38-
; CHECK-NEXT: adrp x8, .LCPI2_0
39-
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
40-
; CHECK-NEXT: cmhi v0.4s, v1.4s, v0.4s
41-
; CHECK-NEXT: xtn v0.4h, v0.4s
42-
; CHECK-NEXT: umov w8, v0.h[1]
43-
; CHECK-NEXT: and w0, w8, #0x1
34+
; CHECK-NEXT: mov w8, v0.s[1]
35+
; CHECK-NEXT: cmp w8, #234
36+
; CHECK-NEXT: cset w0, lo
4437
; CHECK-NEXT: ret
4538
%icmp = icmp ult <4 x i32> %a, <i32 5, i32 234, i32 -1, i32 7>
4639
%ext = extractelement <4 x i1> %icmp, i32 1
@@ -50,12 +43,10 @@ define i1 @extract_icmp_v4i32_const_vec_rhs(<4 x i32> %a) {
5043
define i1 @extract_fcmp_v4f32_const_splat_rhs(<4 x float> %a) {
5144
; CHECK-LABEL: extract_fcmp_v4f32_const_splat_rhs:
5245
; CHECK: // %bb.0:
53-
; CHECK-NEXT: fmov v1.4s, #4.00000000
54-
; CHECK-NEXT: fcmge v0.4s, v0.4s, v1.4s
55-
; CHECK-NEXT: mvn v0.16b, v0.16b
56-
; CHECK-NEXT: xtn v0.4h, v0.4s
57-
; CHECK-NEXT: umov w8, v0.h[1]
58-
; CHECK-NEXT: and w0, w8, #0x1
46+
; CHECK-NEXT: mov s0, v0.s[1]
47+
; CHECK-NEXT: fmov s1, #4.00000000
48+
; CHECK-NEXT: fcmp s0, s1
49+
; CHECK-NEXT: cset w0, lt
5950
; CHECK-NEXT: ret
6051
%fcmp = fcmp ult <4 x float> %a, splat(float 4.0e+0)
6152
%ext = extractelement <4 x i1> %fcmp, i32 1
@@ -66,66 +57,53 @@ define void @vector_loop_with_icmp(ptr nocapture noundef writeonly %dest) {
6657
; CHECK-LABEL: vector_loop_with_icmp:
6758
; CHECK: // %bb.0: // %entry
6859
; CHECK-NEXT: index z0.d, #0, #1
69-
; CHECK-NEXT: mov w8, #15 // =0xf
70-
; CHECK-NEXT: mov w9, #4 // =0x4
60+
; CHECK-NEXT: mov w8, #4 // =0x4
61+
; CHECK-NEXT: mov w9, #16 // =0x10
7162
; CHECK-NEXT: dup v2.2d, x8
72-
; CHECK-NEXT: dup v3.2d, x9
73-
; CHECK-NEXT: add x9, x0, #8
74-
; CHECK-NEXT: mov w10, #16 // =0x10
75-
; CHECK-NEXT: mov w11, #1 // =0x1
63+
; CHECK-NEXT: add x8, x0, #8
64+
; CHECK-NEXT: mov w10, #1 // =0x1
7665
; CHECK-NEXT: mov z1.d, z0.d
7766
; CHECK-NEXT: add z1.d, z1.d, #2 // =0x2
7867
; CHECK-NEXT: b .LBB4_2
7968
; CHECK-NEXT: .LBB4_1: // %pred.store.continue18
8069
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
81-
; CHECK-NEXT: add v1.2d, v1.2d, v3.2d
82-
; CHECK-NEXT: add v0.2d, v0.2d, v3.2d
83-
; CHECK-NEXT: subs x10, x10, #4
84-
; CHECK-NEXT: add x9, x9, #16
70+
; CHECK-NEXT: add v1.2d, v1.2d, v2.2d
71+
; CHECK-NEXT: add v0.2d, v0.2d, v2.2d
72+
; CHECK-NEXT: subs x9, x9, #4
73+
; CHECK-NEXT: add x8, x8, #16
8574
; CHECK-NEXT: b.eq .LBB4_10
8675
; CHECK-NEXT: .LBB4_2: // %vector.body
8776
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
88-
; CHECK-NEXT: cmhi v4.2d, v2.2d, v0.2d
89-
; CHECK-NEXT: xtn v4.2s, v4.2d
90-
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
91-
; CHECK-NEXT: umov w12, v4.h[0]
92-
; CHECK-NEXT: tbz w12, #0, .LBB4_4
77+
; CHECK-NEXT: fmov x11, d0
78+
; CHECK-NEXT: cmp x11, #14
79+
; CHECK-NEXT: b.hi .LBB4_4
9380
; CHECK-NEXT: // %bb.3: // %pred.store.if
9481
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
95-
; CHECK-NEXT: stur w11, [x9, #-8]
82+
; CHECK-NEXT: stur w10, [x8, #-8]
9683
; CHECK-NEXT: .LBB4_4: // %pred.store.continue
9784
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
98-
; CHECK-NEXT: dup v4.2d, x8
99-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v0.2d
100-
; CHECK-NEXT: xtn v4.2s, v4.2d
101-
; CHECK-NEXT: uzp1 v4.4h, v4.4h, v0.4h
102-
; CHECK-NEXT: umov w12, v4.h[1]
103-
; CHECK-NEXT: tbz w12, #0, .LBB4_6
85+
; CHECK-NEXT: mov x11, v0.d[1]
86+
; CHECK-NEXT: cmp x11, #14
87+
; CHECK-NEXT: b.hi .LBB4_6
10488
; CHECK-NEXT: // %bb.5: // %pred.store.if5
10589
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
106-
; CHECK-NEXT: stur w11, [x9, #-4]
90+
; CHECK-NEXT: stur w10, [x8, #-4]
10791
; CHECK-NEXT: .LBB4_6: // %pred.store.continue6
10892
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
109-
; CHECK-NEXT: dup v4.2d, x8
110-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v1.2d
111-
; CHECK-NEXT: xtn v4.2s, v4.2d
112-
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
113-
; CHECK-NEXT: umov w12, v4.h[2]
114-
; CHECK-NEXT: tbz w12, #0, .LBB4_8
93+
; CHECK-NEXT: fmov x11, d1
94+
; CHECK-NEXT: cmp x11, #14
95+
; CHECK-NEXT: b.hi .LBB4_8
11596
; CHECK-NEXT: // %bb.7: // %pred.store.if7
11697
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
117-
; CHECK-NEXT: str w11, [x9]
98+
; CHECK-NEXT: str w10, [x8]
11899
; CHECK-NEXT: .LBB4_8: // %pred.store.continue8
119100
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
120-
; CHECK-NEXT: dup v4.2d, x8
121-
; CHECK-NEXT: cmhi v4.2d, v4.2d, v1.2d
122-
; CHECK-NEXT: xtn v4.2s, v4.2d
123-
; CHECK-NEXT: uzp1 v4.4h, v0.4h, v4.4h
124-
; CHECK-NEXT: umov w12, v4.h[3]
125-
; CHECK-NEXT: tbz w12, #0, .LBB4_1
101+
; CHECK-NEXT: mov x11, v1.d[1]
102+
; CHECK-NEXT: cmp x11, #14
103+
; CHECK-NEXT: b.hi .LBB4_1
126104
; CHECK-NEXT: // %bb.9: // %pred.store.if9
127105
; CHECK-NEXT: // in Loop: Header=BB4_2 Depth=1
128-
; CHECK-NEXT: str w11, [x9, #4]
106+
; CHECK-NEXT: str w10, [x8, #4]
129107
; CHECK-NEXT: b .LBB4_1
130108
; CHECK-NEXT: .LBB4_10: // %for.cond.cleanup
131109
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)