Skip to content

Commit 97ff961

Browse files
authored
[AArch64] Improve code generation of bool vector reduce operations (#115713)
* Avoid unnecessary truncation of comparison results in vecreduce_xor.
* Optimize generated code for vecreduce_and and vecreduce_or by comparing against 0.0 to check if all/any of the values are set.

Alive2 proof of the vecreduce_and and vecreduce_or transformation: https://alive2.llvm.org/ce/z/SRfPtw
1 parent eacdbc2 commit 97ff961

9 files changed

+760
-127
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 47 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15841,11 +15841,27 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
1584115841
return getVectorBitwiseReduce(Opcode, HalfVec, VT, DL, DAG);
1584215842
}
1584315843

15844-
// Vectors that are less than 64 bits get widened to neatly fit a 64 bit
15845-
// register, so e.g. <4 x i1> gets lowered to <4 x i16>. Sign extending to
15844+
// Results of setcc operations get widened to 128 bits for xor reduce if
15845+
// their input operands are 128 bits wide, otherwise vectors that are less
15846+
// than 64 bits get widened to neatly fit a 64 bit register, so e.g.
15847+
// <4 x i1> gets lowered to either <4 x i16> or <4 x i32>. Sign extending to
1584615848
// this element size leads to the best codegen, since e.g. setcc results
1584715849
// might need to be truncated otherwise.
15848-
EVT ExtendedVT = MVT::getIntegerVT(std::max(64u / NumElems, 8u));
15850+
unsigned ExtendedWidth = 64;
15851+
if (ScalarOpcode == ISD::XOR && Vec.getOpcode() == ISD::SETCC &&
15852+
Vec.getOperand(0).getValueSizeInBits() >= 128) {
15853+
ExtendedWidth = 128;
15854+
}
15855+
EVT ExtendedVT = MVT::getIntegerVT(std::max(ExtendedWidth / NumElems, 8u));
15856+
15857+
// Negate the reduced vector value for reduce and operations that use
15858+
// fcmp.
15859+
if (ScalarOpcode == ISD::AND && NumElems < 16) {
15860+
Vec = DAG.getNode(
15861+
ISD::XOR, DL, VecVT, Vec,
15862+
DAG.getSplatVector(
15863+
VecVT, DL, DAG.getConstant(APInt::getAllOnes(32), DL, MVT::i32)));
15864+
}
1584915865

1585015866
// any_ext doesn't work with umin/umax, so only use it for uadd.
1585115867
unsigned ExtendOp =
@@ -15854,10 +15870,36 @@ static SDValue getVectorBitwiseReduce(unsigned Opcode, SDValue Vec, EVT VT,
1585415870
ExtendOp, DL, VecVT.changeVectorElementType(ExtendedVT), Vec);
1585515871
switch (ScalarOpcode) {
1585615872
case ISD::AND:
15857-
Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
15873+
if (NumElems < 16) {
15874+
// Check if all lanes of the negated bool vector value are zero by
15875+
// comparing against 0.0 with ordered and equal predicate. The only
15876+
// non-zero bit pattern that compares ordered and equal to 0.0 is -0.0,
15877+
// where only the sign bit is set. However the bool vector is
15878+
// sign-extended so that each bit in a lane is either zero or one,
15879+
// meaning that it is impossible to get the bit pattern of -0.0.
15880+
assert(Extended.getValueSizeInBits() == 64);
15881+
Extended = DAG.getBitcast(MVT::f64, Extended);
15882+
Result =
15883+
DAG.getSetCC(DL, MVT::i32, Extended,
15884+
DAG.getConstantFP(0.0, DL, MVT::f64), ISD::SETOEQ);
15885+
} else {
15886+
Result = DAG.getNode(ISD::VECREDUCE_UMIN, DL, ExtendedVT, Extended);
15887+
}
1585815888
break;
1585915889
case ISD::OR:
15860-
Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
15890+
if (NumElems < 16) {
15891+
// Check if any lane of the bool vector is set by comparing against 0.0.
15892+
// NaN bit patterns are handled by using the 'unordered or not equal'
15893+
// predicate. Similarly to the reduce and case, -0.0 doesn't have to be
15894+
// handled here (see explanation above).
15895+
assert(Extended.getValueSizeInBits() == 64);
15896+
Extended = DAG.getBitcast(MVT::f64, Extended);
15897+
Result =
15898+
DAG.getSetCC(DL, MVT::i32, Extended,
15899+
DAG.getConstantFP(0.0, DL, MVT::f64), ISD::SETUNE);
15900+
} else {
15901+
Result = DAG.getNode(ISD::VECREDUCE_UMAX, DL, ExtendedVT, Extended);
15902+
}
1586115903
break;
1586215904
case ISD::XOR:
1586315905
Result = DAG.getNode(ISD::VECREDUCE_ADD, DL, ExtendedVT, Extended);

llvm/test/CodeGen/AArch64/dag-combine-setcc.ll

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,8 @@ define i1 @combine_setcc_eq_vecreduce_or_v8i1(<8 x i8> %a) {
55
; CHECK-LABEL: combine_setcc_eq_vecreduce_or_v8i1:
66
; CHECK: // %bb.0:
77
; CHECK-NEXT: cmeq v0.8b, v0.8b, #0
8-
; CHECK-NEXT: mov w8, #1 // =0x1
9-
; CHECK-NEXT: umaxv b0, v0.8b
10-
; CHECK-NEXT: fmov w9, s0
11-
; CHECK-NEXT: bic w0, w8, w9
8+
; CHECK-NEXT: fcmp d0, #0.0
9+
; CHECK-NEXT: cset w0, eq
1210
; CHECK-NEXT: ret
1311
%cmp1 = icmp eq <8 x i8> %a, zeroinitializer
1412
%cast = bitcast <8 x i1> %cmp1 to i8
@@ -73,9 +71,8 @@ define i1 @combine_setcc_ne_vecreduce_or_v8i1(<8 x i8> %a) {
7371
; CHECK-LABEL: combine_setcc_ne_vecreduce_or_v8i1:
7472
; CHECK: // %bb.0:
7573
; CHECK-NEXT: cmtst v0.8b, v0.8b, v0.8b
76-
; CHECK-NEXT: umaxv b0, v0.8b
77-
; CHECK-NEXT: fmov w8, s0
78-
; CHECK-NEXT: and w0, w8, #0x1
74+
; CHECK-NEXT: fcmp d0, #0.0
75+
; CHECK-NEXT: cset w0, ne
7976
; CHECK-NEXT: ret
8077
%cmp1 = icmp ne <8 x i8> %a, zeroinitializer
8178
%cast = bitcast <8 x i1> %cmp1 to i8
@@ -132,10 +129,9 @@ define i1 @combine_setcc_ne_vecreduce_or_v64i1(<64 x i8> %a) {
132129
define i1 @combine_setcc_eq_vecreduce_and_v8i1(<8 x i8> %a) {
133130
; CHECK-LABEL: combine_setcc_eq_vecreduce_and_v8i1:
134131
; CHECK: // %bb.0:
135-
; CHECK-NEXT: cmeq v0.8b, v0.8b, #0
136-
; CHECK-NEXT: uminv b0, v0.8b
137-
; CHECK-NEXT: fmov w8, s0
138-
; CHECK-NEXT: and w0, w8, #0x1
132+
; CHECK-NEXT: cmtst v0.8b, v0.8b, v0.8b
133+
; CHECK-NEXT: fcmp d0, #0.0
134+
; CHECK-NEXT: cset w0, eq
139135
; CHECK-NEXT: ret
140136
%cmp1 = icmp eq <8 x i8> %a, zeroinitializer
141137
%cast = bitcast <8 x i1> %cmp1 to i8
@@ -192,11 +188,9 @@ define i1 @combine_setcc_eq_vecreduce_and_v64i1(<64 x i8> %a) {
192188
define i1 @combine_setcc_ne_vecreduce_and_v8i1(<8 x i8> %a) {
193189
; CHECK-LABEL: combine_setcc_ne_vecreduce_and_v8i1:
194190
; CHECK: // %bb.0:
195-
; CHECK-NEXT: cmtst v0.8b, v0.8b, v0.8b
196-
; CHECK-NEXT: mov w8, #1 // =0x1
197-
; CHECK-NEXT: uminv b0, v0.8b
198-
; CHECK-NEXT: fmov w9, s0
199-
; CHECK-NEXT: bic w0, w8, w9
191+
; CHECK-NEXT: cmeq v0.8b, v0.8b, #0
192+
; CHECK-NEXT: fcmp d0, #0.0
193+
; CHECK-NEXT: cset w0, ne
200194
; CHECK-NEXT: ret
201195
%cmp1 = icmp ne <8 x i8> %a, zeroinitializer
202196
%cast = bitcast <8 x i1> %cmp1 to i8

llvm/test/CodeGen/AArch64/illegal-floating-point-vector-compares.ll

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,13 +9,11 @@ define i1 @unordered_floating_point_compare_on_v8f32(<8 x float> %a_vec) {
99
; CHECK: // %bb.0:
1010
; CHECK-NEXT: fcmgt v1.4s, v1.4s, #0.0
1111
; CHECK-NEXT: fcmgt v0.4s, v0.4s, #0.0
12-
; CHECK-NEXT: mov w8, #1 // =0x1
1312
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
1413
; CHECK-NEXT: mvn v0.16b, v0.16b
1514
; CHECK-NEXT: xtn v0.8b, v0.8h
16-
; CHECK-NEXT: umaxv b0, v0.8b
17-
; CHECK-NEXT: fmov w9, s0
18-
; CHECK-NEXT: bic w0, w8, w9
15+
; CHECK-NEXT: fcmp d0, #0.0
16+
; CHECK-NEXT: cset w0, eq
1917
; CHECK-NEXT: ret
2018
%a_cmp = fcmp ule <8 x float> %a_vec, zeroinitializer
2119
%cmp_result = bitcast <8 x i1> %a_cmp to i8

llvm/test/CodeGen/AArch64/reduce-and.ll

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,11 @@ define i1 @test_redand_v1i1(<1 x i1> %a) {
2020
define i1 @test_redand_v2i1(<2 x i1> %a) {
2121
; CHECK-LABEL: test_redand_v2i1:
2222
; CHECK: // %bb.0:
23+
; CHECK-NEXT: mvn v0.8b, v0.8b
2324
; CHECK-NEXT: shl v0.2s, v0.2s, #31
2425
; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
25-
; CHECK-NEXT: uminp v0.2s, v0.2s, v0.2s
26-
; CHECK-NEXT: fmov w8, s0
27-
; CHECK-NEXT: and w0, w8, #0x1
26+
; CHECK-NEXT: fcmp d0, #0.0
27+
; CHECK-NEXT: cset w0, eq
2828
; CHECK-NEXT: ret
2929
;
3030
; GISEL-LABEL: test_redand_v2i1:
@@ -42,11 +42,11 @@ define i1 @test_redand_v2i1(<2 x i1> %a) {
4242
define i1 @test_redand_v4i1(<4 x i1> %a) {
4343
; CHECK-LABEL: test_redand_v4i1:
4444
; CHECK: // %bb.0:
45+
; CHECK-NEXT: mvn v0.8b, v0.8b
4546
; CHECK-NEXT: shl v0.4h, v0.4h, #15
4647
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
47-
; CHECK-NEXT: uminv h0, v0.4h
48-
; CHECK-NEXT: fmov w8, s0
49-
; CHECK-NEXT: and w0, w8, #0x1
48+
; CHECK-NEXT: fcmp d0, #0.0
49+
; CHECK-NEXT: cset w0, eq
5050
; CHECK-NEXT: ret
5151
;
5252
; GISEL-LABEL: test_redand_v4i1:
@@ -68,11 +68,11 @@ define i1 @test_redand_v4i1(<4 x i1> %a) {
6868
define i1 @test_redand_v8i1(<8 x i1> %a) {
6969
; CHECK-LABEL: test_redand_v8i1:
7070
; CHECK: // %bb.0:
71+
; CHECK-NEXT: mvn v0.8b, v0.8b
7172
; CHECK-NEXT: shl v0.8b, v0.8b, #7
7273
; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
73-
; CHECK-NEXT: uminv b0, v0.8b
74-
; CHECK-NEXT: fmov w8, s0
75-
; CHECK-NEXT: and w0, w8, #0x1
74+
; CHECK-NEXT: fcmp d0, #0.0
75+
; CHECK-NEXT: cset w0, eq
7676
; CHECK-NEXT: ret
7777
;
7878
; GISEL-LABEL: test_redand_v8i1:

llvm/test/CodeGen/AArch64/reduce-or.ll

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,8 @@ define i1 @test_redor_v2i1(<2 x i1> %a) {
2222
; CHECK: // %bb.0:
2323
; CHECK-NEXT: shl v0.2s, v0.2s, #31
2424
; CHECK-NEXT: cmlt v0.2s, v0.2s, #0
25-
; CHECK-NEXT: umaxp v0.2s, v0.2s, v0.2s
26-
; CHECK-NEXT: fmov w8, s0
27-
; CHECK-NEXT: and w0, w8, #0x1
25+
; CHECK-NEXT: fcmp d0, #0.0
26+
; CHECK-NEXT: cset w0, ne
2827
; CHECK-NEXT: ret
2928
;
3029
; GISEL-LABEL: test_redor_v2i1:
@@ -44,9 +43,8 @@ define i1 @test_redor_v4i1(<4 x i1> %a) {
4443
; CHECK: // %bb.0:
4544
; CHECK-NEXT: shl v0.4h, v0.4h, #15
4645
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
47-
; CHECK-NEXT: umaxv h0, v0.4h
48-
; CHECK-NEXT: fmov w8, s0
49-
; CHECK-NEXT: and w0, w8, #0x1
46+
; CHECK-NEXT: fcmp d0, #0.0
47+
; CHECK-NEXT: cset w0, ne
5048
; CHECK-NEXT: ret
5149
;
5250
; GISEL-LABEL: test_redor_v4i1:
@@ -70,9 +68,8 @@ define i1 @test_redor_v8i1(<8 x i1> %a) {
7068
; CHECK: // %bb.0:
7169
; CHECK-NEXT: shl v0.8b, v0.8b, #7
7270
; CHECK-NEXT: cmlt v0.8b, v0.8b, #0
73-
; CHECK-NEXT: umaxv b0, v0.8b
74-
; CHECK-NEXT: fmov w8, s0
75-
; CHECK-NEXT: and w0, w8, #0x1
71+
; CHECK-NEXT: fcmp d0, #0.0
72+
; CHECK-NEXT: cset w0, ne
7673
; CHECK-NEXT: ret
7774
;
7875
; GISEL-LABEL: test_redor_v8i1:

llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -139,11 +139,11 @@ define i32 @test_v3i32(<3 x i32> %a) nounwind {
139139
define i1 @test_v4i1(<4 x i1> %a) nounwind {
140140
; CHECK-LABEL: test_v4i1:
141141
; CHECK: // %bb.0:
142+
; CHECK-NEXT: mvn v0.8b, v0.8b
142143
; CHECK-NEXT: shl v0.4h, v0.4h, #15
143144
; CHECK-NEXT: cmlt v0.4h, v0.4h, #0
144-
; CHECK-NEXT: uminv h0, v0.4h
145-
; CHECK-NEXT: fmov w8, s0
146-
; CHECK-NEXT: and w0, w8, #0x1
145+
; CHECK-NEXT: fcmp d0, #0.0
146+
; CHECK-NEXT: cset w0, eq
147147
; CHECK-NEXT: ret
148148
%b = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> %a)
149149
ret i1 %b

0 commit comments

Comments
 (0)