Skip to content

Commit 5cf75ec

Browse files
committed
[AArch64][SME]: Generate streaming-compatible code for int/fp select/vselect
To generate code compatible with streaming mode, enable custom lowering for VSETCC; this is needed for the fp-vselect.ll and int-vselect.ll tests. Differential Revision: https://reviews.llvm.org/D138519
1 parent 39641b1 commit 5cf75ec

File tree

3 files changed

+90
-68
lines changed

3 files changed

+90
-68
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12901,7 +12901,8 @@ SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
1290112901
if (Op.getValueType().isScalableVector())
1290212902
return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
1290312903

12904-
if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
12904+
if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType(),
12905+
Subtarget->forceStreamingCompatibleSVE()))
1290512906
return LowerFixedLengthVectorSetccToSVE(Op, DAG);
1290612907

1290712908
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
@@ -22867,7 +22868,7 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorSetccToSVE(
2286722868
EVT InVT = Op.getOperand(0).getValueType();
2286822869
EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT);
2286922870

22870-
assert(useSVEForFixedLengthVectorVT(InVT) &&
22871+
assert(InVT.isFixedLengthVector() && isTypeLegal(InVT) &&
2287122872
"Only expected to lower fixed length vector operation!");
2287222873
assert(Op.getValueType() == InVT.changeTypeToInteger() &&
2287322874
"Expected integer result of the same bit length as the inputs!");

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fp-vselect.ll

Lines changed: 39 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -87,18 +87,21 @@ define void @select_v16f16(ptr %a, ptr %b) #0 {
8787
; CHECK: // %bb.0:
8888
; CHECK-NEXT: ldp q0, q1, [x1]
8989
; CHECK-NEXT: adrp x8, .LCPI3_0
90+
; CHECK-NEXT: ptrue p0.h, vl8
9091
; CHECK-NEXT: ldp q3, q2, [x0]
91-
; CHECK-NEXT: fcmeq v5.8h, v3.8h, v0.8h
92-
; CHECK-NEXT: fcmeq v4.8h, v2.8h, v1.8h
93-
; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI3_0]
94-
; CHECK-NEXT: and z3.d, z3.d, z5.d
95-
; CHECK-NEXT: and z2.d, z2.d, z4.d
96-
; CHECK-NEXT: eor z4.d, z4.d, z6.d
97-
; CHECK-NEXT: eor z6.d, z5.d, z6.d
98-
; CHECK-NEXT: and z1.d, z1.d, z4.d
99-
; CHECK-NEXT: and z0.d, z0.d, z6.d
100-
; CHECK-NEXT: orr z1.d, z2.d, z1.d
92+
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI3_0]
93+
; CHECK-NEXT: fcmeq p1.h, p0/z, z2.h, z1.h
94+
; CHECK-NEXT: fcmeq p0.h, p0/z, z3.h, z0.h
95+
; CHECK-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff
96+
; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff
97+
; CHECK-NEXT: and z2.d, z2.d, z5.d
98+
; CHECK-NEXT: eor z5.d, z5.d, z4.d
99+
; CHECK-NEXT: eor z4.d, z6.d, z4.d
100+
; CHECK-NEXT: and z3.d, z3.d, z6.d
101+
; CHECK-NEXT: and z0.d, z0.d, z4.d
102+
; CHECK-NEXT: and z1.d, z1.d, z5.d
101103
; CHECK-NEXT: orr z0.d, z3.d, z0.d
104+
; CHECK-NEXT: orr z1.d, z2.d, z1.d
102105
; CHECK-NEXT: stp q0, q1, [x0]
103106
; CHECK-NEXT: ret
104107
%op1 = load <16 x half>, ptr %a
@@ -161,18 +164,21 @@ define void @select_v8f32(ptr %a, ptr %b) #0 {
161164
; CHECK: // %bb.0:
162165
; CHECK-NEXT: ldp q0, q1, [x1]
163166
; CHECK-NEXT: adrp x8, .LCPI6_0
167+
; CHECK-NEXT: ptrue p0.s, vl4
164168
; CHECK-NEXT: ldp q3, q2, [x0]
165-
; CHECK-NEXT: fcmeq v5.4s, v3.4s, v0.4s
166-
; CHECK-NEXT: fcmeq v4.4s, v2.4s, v1.4s
167-
; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI6_0]
168-
; CHECK-NEXT: and z3.d, z3.d, z5.d
169-
; CHECK-NEXT: and z2.d, z2.d, z4.d
170-
; CHECK-NEXT: eor z4.d, z4.d, z6.d
171-
; CHECK-NEXT: eor z6.d, z5.d, z6.d
172-
; CHECK-NEXT: and z1.d, z1.d, z4.d
173-
; CHECK-NEXT: and z0.d, z0.d, z6.d
174-
; CHECK-NEXT: orr z1.d, z2.d, z1.d
169+
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI6_0]
170+
; CHECK-NEXT: fcmeq p1.s, p0/z, z2.s, z1.s
171+
; CHECK-NEXT: fcmeq p0.s, p0/z, z3.s, z0.s
172+
; CHECK-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff
173+
; CHECK-NEXT: mov z6.s, p0/z, #-1 // =0xffffffffffffffff
174+
; CHECK-NEXT: and z2.d, z2.d, z5.d
175+
; CHECK-NEXT: eor z5.d, z5.d, z4.d
176+
; CHECK-NEXT: eor z4.d, z6.d, z4.d
177+
; CHECK-NEXT: and z3.d, z3.d, z6.d
178+
; CHECK-NEXT: and z0.d, z0.d, z4.d
179+
; CHECK-NEXT: and z1.d, z1.d, z5.d
175180
; CHECK-NEXT: orr z0.d, z3.d, z0.d
181+
; CHECK-NEXT: orr z1.d, z2.d, z1.d
176182
; CHECK-NEXT: stp q0, q1, [x0]
177183
; CHECK-NEXT: ret
178184
%op1 = load <8 x float>, ptr %a
@@ -232,18 +238,21 @@ define void @select_v4f64(ptr %a, ptr %b) #0 {
232238
; CHECK: // %bb.0:
233239
; CHECK-NEXT: ldp q0, q1, [x1]
234240
; CHECK-NEXT: adrp x8, .LCPI9_0
241+
; CHECK-NEXT: ptrue p0.d, vl2
235242
; CHECK-NEXT: ldp q3, q2, [x0]
236-
; CHECK-NEXT: fcmeq v5.2d, v3.2d, v0.2d
237-
; CHECK-NEXT: fcmeq v4.2d, v2.2d, v1.2d
238-
; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI9_0]
239-
; CHECK-NEXT: and z3.d, z3.d, z5.d
240-
; CHECK-NEXT: and z2.d, z2.d, z4.d
241-
; CHECK-NEXT: eor z4.d, z4.d, z6.d
242-
; CHECK-NEXT: eor z6.d, z5.d, z6.d
243-
; CHECK-NEXT: and z1.d, z1.d, z4.d
244-
; CHECK-NEXT: and z0.d, z0.d, z6.d
245-
; CHECK-NEXT: orr z1.d, z2.d, z1.d
243+
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI9_0]
244+
; CHECK-NEXT: fcmeq p1.d, p0/z, z2.d, z1.d
245+
; CHECK-NEXT: fcmeq p0.d, p0/z, z3.d, z0.d
246+
; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff
247+
; CHECK-NEXT: mov z6.d, p0/z, #-1 // =0xffffffffffffffff
248+
; CHECK-NEXT: and z2.d, z2.d, z5.d
249+
; CHECK-NEXT: eor z5.d, z5.d, z4.d
250+
; CHECK-NEXT: eor z4.d, z6.d, z4.d
251+
; CHECK-NEXT: and z3.d, z3.d, z6.d
252+
; CHECK-NEXT: and z0.d, z0.d, z4.d
253+
; CHECK-NEXT: and z1.d, z1.d, z5.d
246254
; CHECK-NEXT: orr z0.d, z3.d, z0.d
255+
; CHECK-NEXT: orr z1.d, z2.d, z1.d
247256
; CHECK-NEXT: stp q0, q1, [x0]
248257
; CHECK-NEXT: ret
249258
%op1 = load <4 x double>, ptr %a

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-vselect.ll

Lines changed: 48 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -75,21 +75,24 @@ define <16 x i8> @select_v16i8(<16 x i8> %op1, <16 x i8> %op2, <16 x i1> %mask)
7575
define void @select_v32i8(ptr %a, ptr %b) #0 {
7676
; CHECK-LABEL: select_v32i8:
7777
; CHECK: // %bb.0:
78-
; CHECK-NEXT: ldp q0, q1, [x1]
78+
; CHECK-NEXT: ldp q1, q0, [x1]
7979
; CHECK-NEXT: adrp x8, .LCPI3_0
80+
; CHECK-NEXT: ptrue p0.b, vl16
8081
; CHECK-NEXT: ldp q3, q2, [x0]
81-
; CHECK-NEXT: cmeq v6.16b, v3.16b, v0.16b
8282
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI3_0]
83-
; CHECK-NEXT: and z3.d, z3.d, z6.d
84-
; CHECK-NEXT: cmeq v5.16b, v2.16b, v1.16b
83+
; CHECK-NEXT: cmpeq p1.b, p0/z, z2.b, z0.b
84+
; CHECK-NEXT: cmpeq p0.b, p0/z, z3.b, z1.b
85+
; CHECK-NEXT: mov z5.b, p1/z, #-1 // =0xffffffffffffffff
86+
; CHECK-NEXT: mov z6.b, p0/z, #-1 // =0xffffffffffffffff
8587
; CHECK-NEXT: and z2.d, z2.d, z5.d
8688
; CHECK-NEXT: eor z5.d, z5.d, z4.d
8789
; CHECK-NEXT: eor z4.d, z6.d, z4.d
88-
; CHECK-NEXT: and z1.d, z1.d, z5.d
89-
; CHECK-NEXT: and z0.d, z0.d, z4.d
90-
; CHECK-NEXT: orr z1.d, z2.d, z1.d
91-
; CHECK-NEXT: orr z0.d, z3.d, z0.d
92-
; CHECK-NEXT: stp q0, q1, [x0]
90+
; CHECK-NEXT: and z3.d, z3.d, z6.d
91+
; CHECK-NEXT: and z1.d, z1.d, z4.d
92+
; CHECK-NEXT: and z0.d, z0.d, z5.d
93+
; CHECK-NEXT: orr z1.d, z3.d, z1.d
94+
; CHECK-NEXT: orr z0.d, z2.d, z0.d
95+
; CHECK-NEXT: stp q1, q0, [x0]
9396
; CHECK-NEXT: ret
9497
%op1 = load <32 x i8>, ptr %a
9598
%op2 = load <32 x i8>, ptr %b
@@ -172,21 +175,24 @@ define <8 x i16> @select_v8i16(<8 x i16> %op1, <8 x i16> %op2, <8 x i1> %mask) #
172175
define void @select_v16i16(ptr %a, ptr %b) #0 {
173176
; CHECK-LABEL: select_v16i16:
174177
; CHECK: // %bb.0:
175-
; CHECK-NEXT: ldp q0, q1, [x1]
178+
; CHECK-NEXT: ldp q1, q0, [x1]
176179
; CHECK-NEXT: adrp x8, .LCPI7_0
180+
; CHECK-NEXT: ptrue p0.h, vl8
177181
; CHECK-NEXT: ldp q3, q2, [x0]
178-
; CHECK-NEXT: cmeq v6.8h, v3.8h, v0.8h
179182
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI7_0]
180-
; CHECK-NEXT: and z3.d, z3.d, z6.d
181-
; CHECK-NEXT: cmeq v5.8h, v2.8h, v1.8h
183+
; CHECK-NEXT: cmpeq p1.h, p0/z, z2.h, z0.h
184+
; CHECK-NEXT: cmpeq p0.h, p0/z, z3.h, z1.h
185+
; CHECK-NEXT: mov z5.h, p1/z, #-1 // =0xffffffffffffffff
186+
; CHECK-NEXT: mov z6.h, p0/z, #-1 // =0xffffffffffffffff
182187
; CHECK-NEXT: and z2.d, z2.d, z5.d
183188
; CHECK-NEXT: eor z5.d, z5.d, z4.d
184189
; CHECK-NEXT: eor z4.d, z6.d, z4.d
185-
; CHECK-NEXT: and z1.d, z1.d, z5.d
186-
; CHECK-NEXT: and z0.d, z0.d, z4.d
187-
; CHECK-NEXT: orr z1.d, z2.d, z1.d
188-
; CHECK-NEXT: orr z0.d, z3.d, z0.d
189-
; CHECK-NEXT: stp q0, q1, [x0]
190+
; CHECK-NEXT: and z3.d, z3.d, z6.d
191+
; CHECK-NEXT: and z1.d, z1.d, z4.d
192+
; CHECK-NEXT: and z0.d, z0.d, z5.d
193+
; CHECK-NEXT: orr z1.d, z3.d, z1.d
194+
; CHECK-NEXT: orr z0.d, z2.d, z0.d
195+
; CHECK-NEXT: stp q1, q0, [x0]
190196
; CHECK-NEXT: ret
191197
%op1 = load <16 x i16>, ptr %a
192198
%op2 = load <16 x i16>, ptr %b
@@ -246,21 +252,24 @@ define <4 x i32> @select_v4i32(<4 x i32> %op1, <4 x i32> %op2, <4 x i1> %mask) #
246252
define void @select_v8i32(ptr %a, ptr %b) #0 {
247253
; CHECK-LABEL: select_v8i32:
248254
; CHECK: // %bb.0:
249-
; CHECK-NEXT: ldp q0, q1, [x1]
255+
; CHECK-NEXT: ldp q1, q0, [x1]
250256
; CHECK-NEXT: adrp x8, .LCPI10_0
257+
; CHECK-NEXT: ptrue p0.s, vl4
251258
; CHECK-NEXT: ldp q3, q2, [x0]
252-
; CHECK-NEXT: cmeq v6.4s, v3.4s, v0.4s
253259
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI10_0]
254-
; CHECK-NEXT: and z3.d, z3.d, z6.d
255-
; CHECK-NEXT: cmeq v5.4s, v2.4s, v1.4s
260+
; CHECK-NEXT: cmpeq p1.s, p0/z, z2.s, z0.s
261+
; CHECK-NEXT: cmpeq p0.s, p0/z, z3.s, z1.s
262+
; CHECK-NEXT: mov z5.s, p1/z, #-1 // =0xffffffffffffffff
263+
; CHECK-NEXT: mov z6.s, p0/z, #-1 // =0xffffffffffffffff
256264
; CHECK-NEXT: and z2.d, z2.d, z5.d
257265
; CHECK-NEXT: eor z5.d, z5.d, z4.d
258266
; CHECK-NEXT: eor z4.d, z6.d, z4.d
259-
; CHECK-NEXT: and z1.d, z1.d, z5.d
260-
; CHECK-NEXT: and z0.d, z0.d, z4.d
261-
; CHECK-NEXT: orr z1.d, z2.d, z1.d
262-
; CHECK-NEXT: orr z0.d, z3.d, z0.d
263-
; CHECK-NEXT: stp q0, q1, [x0]
267+
; CHECK-NEXT: and z3.d, z3.d, z6.d
268+
; CHECK-NEXT: and z1.d, z1.d, z4.d
269+
; CHECK-NEXT: and z0.d, z0.d, z5.d
270+
; CHECK-NEXT: orr z1.d, z3.d, z1.d
271+
; CHECK-NEXT: orr z0.d, z2.d, z0.d
272+
; CHECK-NEXT: stp q1, q0, [x0]
264273
; CHECK-NEXT: ret
265274
%op1 = load <8 x i32>, ptr %a
266275
%op2 = load <8 x i32>, ptr %b
@@ -317,21 +326,24 @@ define <2 x i64> @select_v2i64(<2 x i64> %op1, <2 x i64> %op2, <2 x i1> %mask) #
317326
define void @select_v4i64(ptr %a, ptr %b) #0 {
318327
; CHECK-LABEL: select_v4i64:
319328
; CHECK: // %bb.0:
320-
; CHECK-NEXT: ldp q0, q1, [x1]
329+
; CHECK-NEXT: ldp q1, q0, [x1]
321330
; CHECK-NEXT: adrp x8, .LCPI13_0
331+
; CHECK-NEXT: ptrue p0.d, vl2
322332
; CHECK-NEXT: ldp q3, q2, [x0]
323-
; CHECK-NEXT: cmeq v6.2d, v3.2d, v0.2d
324333
; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI13_0]
325-
; CHECK-NEXT: and z3.d, z3.d, z6.d
326-
; CHECK-NEXT: cmeq v5.2d, v2.2d, v1.2d
334+
; CHECK-NEXT: cmpeq p1.d, p0/z, z2.d, z0.d
335+
; CHECK-NEXT: cmpeq p0.d, p0/z, z3.d, z1.d
336+
; CHECK-NEXT: mov z5.d, p1/z, #-1 // =0xffffffffffffffff
337+
; CHECK-NEXT: mov z6.d, p0/z, #-1 // =0xffffffffffffffff
327338
; CHECK-NEXT: and z2.d, z2.d, z5.d
328339
; CHECK-NEXT: eor z5.d, z5.d, z4.d
329340
; CHECK-NEXT: eor z4.d, z6.d, z4.d
330-
; CHECK-NEXT: and z1.d, z1.d, z5.d
331-
; CHECK-NEXT: and z0.d, z0.d, z4.d
332-
; CHECK-NEXT: orr z1.d, z2.d, z1.d
333-
; CHECK-NEXT: orr z0.d, z3.d, z0.d
334-
; CHECK-NEXT: stp q0, q1, [x0]
341+
; CHECK-NEXT: and z3.d, z3.d, z6.d
342+
; CHECK-NEXT: and z1.d, z1.d, z4.d
343+
; CHECK-NEXT: and z0.d, z0.d, z5.d
344+
; CHECK-NEXT: orr z1.d, z3.d, z1.d
345+
; CHECK-NEXT: orr z0.d, z2.d, z0.d
346+
; CHECK-NEXT: stp q1, q0, [x0]
335347
; CHECK-NEXT: ret
336348
%op1 = load <4 x i64>, ptr %a
337349
%op2 = load <4 x i64>, ptr %b

0 commit comments

Comments
 (0)