Skip to content

Commit c060757

Browse files
committed
[ARM] Correct v2i1 concat extract types.
When two v2i1 vectors are concatenated into a v4i1, we cannot extract each i64 element as an i32. Instead, this casts the source vector to a v4i32 and extracts the correct vector lanes.
1 parent 561fcf5 commit c060757

File tree

2 files changed

+228
-9
lines changed

2 files changed

+228
-9
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9082,6 +9082,8 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
90829082
EVT Op1VT = V1.getValueType();
90839083
EVT Op2VT = V2.getValueType();
90849084
assert(Op1VT == Op2VT && "Operand types don't match!");
9085+
assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
9086+
"Unexpected i1 concat operations!");
90859087
EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
90869088

90879089
SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
@@ -9103,9 +9105,14 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
91039105
auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
91049106
EVT NewVT = NewV.getValueType();
91059107
EVT ConcatVT = ConVec.getValueType();
9108+
unsigned ExtScale = 1;
9109+
if (NewVT == MVT::v2f64) {
9110+
NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
9111+
ExtScale = 2;
9112+
}
91069113
for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
91079114
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
9108-
DAG.getIntPtrConstant(i, dl));
9115+
DAG.getIntPtrConstant(i * ExtScale, dl));
91099116
ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
91109117
DAG.getConstant(j, dl, MVT::i32));
91119118
}
@@ -9116,14 +9123,7 @@ static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG,
91169123
ConVec = ExtractInto(NewV2, ConVec, j);
91179124

91189125
// Now return the result of comparing the subvector with zero, which will
9119-
// generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1 we
9120-
// convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
9121-
if (VT == MVT::v2i1) {
9122-
SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, ConVec);
9123-
SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
9124-
DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9125-
return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9126-
}
9126+
// generate a real predicate, i.e. v4i1, v8i1 or v16i1.
91279127
return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
91289128
DAG.getConstant(ARMCC::NE, dl, MVT::i32));
91299129
};
Lines changed: 219 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s
3+
4+
define arm_aapcs_vfpcc <4 x i32> @concat_v2i1(i32 %a, i32 %b, <4 x i32> %c) {
5+
; CHECK-LABEL: concat_v2i1:
6+
; CHECK: @ %bb.0: @ %entry
7+
; CHECK-NEXT: vmsr p0, r1
8+
; CHECK-NEXT: vmov.i8 q1, #0x0
9+
; CHECK-NEXT: vmov.i8 q2, #0xff
10+
; CHECK-NEXT: vpsel q3, q2, q1
11+
; CHECK-NEXT: vmsr p0, r0
12+
; CHECK-NEXT: vpsel q1, q2, q1
13+
; CHECK-NEXT: vmov r1, s12
14+
; CHECK-NEXT: vmov r0, s4
15+
; CHECK-NEXT: vmov q2[2], q2[0], r0, r1
16+
; CHECK-NEXT: vmov r1, s6
17+
; CHECK-NEXT: vmov r0, s14
18+
; CHECK-NEXT: vmov.i32 q1, #0x0
19+
; CHECK-NEXT: vmov q2[3], q2[1], r1, r0
20+
; CHECK-NEXT: vcmp.i32 ne, q2, zr
21+
; CHECK-NEXT: vpsel q0, q0, q1
22+
; CHECK-NEXT: bx lr
23+
entry:
24+
%ai = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %a)
25+
%bi = call <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32 %b)
26+
%s = shufflevector <2 x i1> %ai, <2 x i1> %bi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
27+
%ci = select <4 x i1> %s, <4 x i32> %c, <4 x i32> zeroinitializer
28+
ret <4 x i32> %ci
29+
}
30+
31+
declare <2 x i1> @llvm.arm.mve.pred.i2v.v2i1(i32)
32+
33+
34+
define arm_aapcs_vfpcc <8 x i16> @concat_v4i1(<4 x i32> %a, <4 x i32> %b, <8 x i16> %c) {
35+
; CHECK-LABEL: concat_v4i1:
36+
; CHECK: @ %bb.0: @ %entry
37+
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
38+
; CHECK-NEXT: vpush {d8, d9, d10, d11}
39+
; CHECK-NEXT: vmov.i8 q3, #0x0
40+
; CHECK-NEXT: vmov.i8 q4, #0xff
41+
; CHECK-NEXT: vcmp.s32 lt, q0, zr
42+
; CHECK-NEXT: vpsel q5, q4, q3
43+
; CHECK-NEXT: vcmp.s32 lt, q1, zr
44+
; CHECK-NEXT: vmov r0, r1, d10
45+
; CHECK-NEXT: vpsel q1, q4, q3
46+
; CHECK-NEXT: vmov.16 q0[0], r0
47+
; CHECK-NEXT: vmov.16 q0[1], r1
48+
; CHECK-NEXT: vmov r0, r1, d11
49+
; CHECK-NEXT: vmov.16 q0[2], r0
50+
; CHECK-NEXT: vmov.16 q0[3], r1
51+
; CHECK-NEXT: vmov r0, r1, d2
52+
; CHECK-NEXT: vmov.16 q0[4], r0
53+
; CHECK-NEXT: vmov.16 q0[5], r1
54+
; CHECK-NEXT: vmov r0, r1, d3
55+
; CHECK-NEXT: vmov.16 q0[6], r0
56+
; CHECK-NEXT: vmov.16 q0[7], r1
57+
; CHECK-NEXT: vcmp.i16 ne, q0, zr
58+
; CHECK-NEXT: vmov.i32 q0, #0x0
59+
; CHECK-NEXT: vpsel q0, q2, q0
60+
; CHECK-NEXT: vpop {d8, d9, d10, d11}
61+
; CHECK-NEXT: bx lr
62+
entry:
63+
%ai = icmp slt <4 x i32> %a, zeroinitializer
64+
%bi = icmp slt <4 x i32> %b, zeroinitializer
65+
%s = shufflevector <4 x i1> %ai, <4 x i1> %bi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
66+
%ci = select <8 x i1> %s, <8 x i16> %c, <8 x i16> zeroinitializer
67+
ret <8 x i16> %ci
68+
}
69+
70+
define arm_aapcs_vfpcc <16 x i8> @concat_v8i1(<8 x i16> %a, <8 x i16> %b, <16 x i8> %c) {
71+
; CHECK-LABEL: concat_v8i1:
72+
; CHECK: @ %bb.0: @ %entry
73+
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
74+
; CHECK-NEXT: vpush {d8, d9, d10, d11}
75+
; CHECK-NEXT: vmov.i8 q3, #0x0
76+
; CHECK-NEXT: vmov.i8 q4, #0xff
77+
; CHECK-NEXT: vcmp.s16 lt, q0, zr
78+
; CHECK-NEXT: vpsel q5, q4, q3
79+
; CHECK-NEXT: vcmp.s16 lt, q1, zr
80+
; CHECK-NEXT: vmov.u16 r0, q5[0]
81+
; CHECK-NEXT: vpsel q1, q4, q3
82+
; CHECK-NEXT: vmov.8 q0[0], r0
83+
; CHECK-NEXT: vmov.u16 r0, q5[1]
84+
; CHECK-NEXT: vmov.8 q0[1], r0
85+
; CHECK-NEXT: vmov.u16 r0, q5[2]
86+
; CHECK-NEXT: vmov.8 q0[2], r0
87+
; CHECK-NEXT: vmov.u16 r0, q5[3]
88+
; CHECK-NEXT: vmov.8 q0[3], r0
89+
; CHECK-NEXT: vmov.u16 r0, q5[4]
90+
; CHECK-NEXT: vmov.8 q0[4], r0
91+
; CHECK-NEXT: vmov.u16 r0, q5[5]
92+
; CHECK-NEXT: vmov.8 q0[5], r0
93+
; CHECK-NEXT: vmov.u16 r0, q5[6]
94+
; CHECK-NEXT: vmov.8 q0[6], r0
95+
; CHECK-NEXT: vmov.u16 r0, q5[7]
96+
; CHECK-NEXT: vmov.8 q0[7], r0
97+
; CHECK-NEXT: vmov.u16 r0, q1[0]
98+
; CHECK-NEXT: vmov.8 q0[8], r0
99+
; CHECK-NEXT: vmov.u16 r0, q1[1]
100+
; CHECK-NEXT: vmov.8 q0[9], r0
101+
; CHECK-NEXT: vmov.u16 r0, q1[2]
102+
; CHECK-NEXT: vmov.8 q0[10], r0
103+
; CHECK-NEXT: vmov.u16 r0, q1[3]
104+
; CHECK-NEXT: vmov.8 q0[11], r0
105+
; CHECK-NEXT: vmov.u16 r0, q1[4]
106+
; CHECK-NEXT: vmov.8 q0[12], r0
107+
; CHECK-NEXT: vmov.u16 r0, q1[5]
108+
; CHECK-NEXT: vmov.8 q0[13], r0
109+
; CHECK-NEXT: vmov.u16 r0, q1[6]
110+
; CHECK-NEXT: vmov.8 q0[14], r0
111+
; CHECK-NEXT: vmov.u16 r0, q1[7]
112+
; CHECK-NEXT: vmov.8 q0[15], r0
113+
; CHECK-NEXT: vcmp.i8 ne, q0, zr
114+
; CHECK-NEXT: vmov.i32 q0, #0x0
115+
; CHECK-NEXT: vpsel q0, q2, q0
116+
; CHECK-NEXT: vpop {d8, d9, d10, d11}
117+
; CHECK-NEXT: bx lr
118+
entry:
119+
%ai = icmp slt <8 x i16> %a, zeroinitializer
120+
%bi = icmp slt <8 x i16> %b, zeroinitializer
121+
%s = shufflevector <8 x i1> %ai, <8 x i1> %bi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
122+
%ci = select <16 x i1> %s, <16 x i8> %c, <16 x i8> zeroinitializer
123+
ret <16 x i8> %ci
124+
}
125+
126+
127+
define arm_aapcs_vfpcc <16 x i8> @concat_v48i1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %d, <4 x i32> %e, <16 x i8> %c) {
128+
; CHECK-LABEL: concat_v48i1:
129+
; CHECK: @ %bb.0: @ %entry
130+
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
131+
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
132+
; CHECK-NEXT: vmov.i8 q4, #0x0
133+
; CHECK-NEXT: vmov.i8 q5, #0xff
134+
; CHECK-NEXT: vcmp.s32 lt, q0, zr
135+
; CHECK-NEXT: vpsel q6, q5, q4
136+
; CHECK-NEXT: vcmp.s32 lt, q1, zr
137+
; CHECK-NEXT: vmov r0, r1, d12
138+
; CHECK-NEXT: vpsel q1, q5, q4
139+
; CHECK-NEXT: vmov.16 q0[0], r0
140+
; CHECK-NEXT: vmov.16 q0[1], r1
141+
; CHECK-NEXT: vmov r0, r1, d13
142+
; CHECK-NEXT: vmov.16 q0[2], r0
143+
; CHECK-NEXT: vmov.16 q0[3], r1
144+
; CHECK-NEXT: vmov r0, r1, d2
145+
; CHECK-NEXT: vmov.16 q0[4], r0
146+
; CHECK-NEXT: vmov.16 q0[5], r1
147+
; CHECK-NEXT: vmov r0, r1, d3
148+
; CHECK-NEXT: vmov.16 q0[6], r0
149+
; CHECK-NEXT: vmov.16 q0[7], r1
150+
; CHECK-NEXT: vcmp.i16 ne, q0, zr
151+
; CHECK-NEXT: vpsel q1, q5, q4
152+
; CHECK-NEXT: vcmp.s32 lt, q2, zr
153+
; CHECK-NEXT: vmov.u16 r0, q1[0]
154+
; CHECK-NEXT: vpsel q2, q5, q4
155+
; CHECK-NEXT: vmov.8 q0[0], r0
156+
; CHECK-NEXT: vmov.u16 r0, q1[1]
157+
; CHECK-NEXT: vmov.8 q0[1], r0
158+
; CHECK-NEXT: vmov.u16 r0, q1[2]
159+
; CHECK-NEXT: vmov.8 q0[2], r0
160+
; CHECK-NEXT: vmov.u16 r0, q1[3]
161+
; CHECK-NEXT: vmov.8 q0[3], r0
162+
; CHECK-NEXT: vmov.u16 r0, q1[4]
163+
; CHECK-NEXT: vmov.8 q0[4], r0
164+
; CHECK-NEXT: vmov.u16 r0, q1[5]
165+
; CHECK-NEXT: vmov.8 q0[5], r0
166+
; CHECK-NEXT: vmov.u16 r0, q1[6]
167+
; CHECK-NEXT: vmov.8 q0[6], r0
168+
; CHECK-NEXT: vmov.u16 r0, q1[7]
169+
; CHECK-NEXT: vmov.8 q0[7], r0
170+
; CHECK-NEXT: vmov r0, r1, d4
171+
; CHECK-NEXT: vmov.16 q1[0], r0
172+
; CHECK-NEXT: vcmp.s32 lt, q3, zr
173+
; CHECK-NEXT: vmov.16 q1[1], r1
174+
; CHECK-NEXT: vmov r0, r1, d5
175+
; CHECK-NEXT: vmov.16 q1[2], r0
176+
; CHECK-NEXT: vpsel q2, q5, q4
177+
; CHECK-NEXT: vmov.16 q1[3], r1
178+
; CHECK-NEXT: vmov r0, r1, d4
179+
; CHECK-NEXT: vmov.16 q1[4], r0
180+
; CHECK-NEXT: vmov.16 q1[5], r1
181+
; CHECK-NEXT: vmov r0, r1, d5
182+
; CHECK-NEXT: vmov.16 q1[6], r0
183+
; CHECK-NEXT: vmov.16 q1[7], r1
184+
; CHECK-NEXT: vcmp.i16 ne, q1, zr
185+
; CHECK-NEXT: vpsel q1, q5, q4
186+
; CHECK-NEXT: vmov.u16 r0, q1[0]
187+
; CHECK-NEXT: vmov.8 q0[8], r0
188+
; CHECK-NEXT: vmov.u16 r0, q1[1]
189+
; CHECK-NEXT: vmov.8 q0[9], r0
190+
; CHECK-NEXT: vmov.u16 r0, q1[2]
191+
; CHECK-NEXT: vmov.8 q0[10], r0
192+
; CHECK-NEXT: vmov.u16 r0, q1[3]
193+
; CHECK-NEXT: vmov.8 q0[11], r0
194+
; CHECK-NEXT: vmov.u16 r0, q1[4]
195+
; CHECK-NEXT: vmov.8 q0[12], r0
196+
; CHECK-NEXT: vmov.u16 r0, q1[5]
197+
; CHECK-NEXT: vmov.8 q0[13], r0
198+
; CHECK-NEXT: vmov.u16 r0, q1[6]
199+
; CHECK-NEXT: vmov.8 q0[14], r0
200+
; CHECK-NEXT: vmov.u16 r0, q1[7]
201+
; CHECK-NEXT: vmov.8 q0[15], r0
202+
; CHECK-NEXT: add r0, sp, #48
203+
; CHECK-NEXT: vldrw.u32 q1, [r0]
204+
; CHECK-NEXT: vcmp.i8 ne, q0, zr
205+
; CHECK-NEXT: vmov.i32 q0, #0x0
206+
; CHECK-NEXT: vpsel q0, q1, q0
207+
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
208+
; CHECK-NEXT: bx lr
209+
entry:
210+
%ai = icmp slt <4 x i32> %a, zeroinitializer
211+
%bi = icmp slt <4 x i32> %b, zeroinitializer
212+
%di = icmp slt <4 x i32> %d, zeroinitializer
213+
%ei = icmp slt <4 x i32> %e, zeroinitializer
214+
%s1 = shufflevector <4 x i1> %ai, <4 x i1> %bi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
215+
%s2 = shufflevector <4 x i1> %di, <4 x i1> %ei, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
216+
%s = shufflevector <8 x i1> %s1, <8 x i1> %s2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
217+
%ci = select <16 x i1> %s, <16 x i8> %c, <16 x i8> zeroinitializer
218+
ret <16 x i8> %ci
219+
}

0 commit comments

Comments
 (0)