Skip to content

Commit 3f9d385

Browse files
author
Dinar Temirbulatov
authored
[AArch64][SME] Shuffle lowering, assume that the minimal SVE register is 128-bit, when NEON is not available. (#71647)
We can assume that the minimal SVE register is 128-bit when NEON is not available, and we can lower a shuffle operation with one operand to the TBL1 SVE instruction.
1 parent 9cdaeef commit 3f9d385

6 files changed

+71
-179
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26124,6 +26124,9 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
2612426124
bool IsSingleOp =
2612526125
ShuffleVectorInst::isSingleSourceMask(ShuffleMask, ShuffleMask.size());
2612626126

26127+
if (!Subtarget.isNeonAvailable() && !MinSVESize)
26128+
MinSVESize = 128;
26129+
2612726130
// Ignore two operands if no SVE2 or all index numbers couldn't
2612826131
// be represented.
2612926132
if (!IsSingleOp && (!Subtarget.hasSVE2() || MinSVESize != MaxSVESize))
@@ -26135,9 +26138,8 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2,
2613526138
unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements();
2613626139
unsigned MaskSize = ShuffleMask.size();
2613726140
uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue();
26138-
assert(ElementsPerVectorReg <= IndexLen && MaskSize <= IndexLen &&
26141+
assert(ElementsPerVectorReg <= IndexLen && ShuffleMask.size() <= IndexLen &&
2613926142
"Incorrectly legalised shuffle operation");
26140-
(void)MaskSize;
2614126143

2614226144
SmallVector<SDValue, 8> TBLMask;
2614326145
for (int Index : ShuffleMask) {
@@ -26333,8 +26335,10 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
2633326335
}
2633426336
}
2633526337

26336-
// Avoid producing TBL instruction if we don't know SVE register minimal size.
26337-
if (MinSVESize)
26338+
// Avoid producing TBL instruction if we don't know SVE register minimal size,
26339+
// unless NEON is not available and we can assume minimal SVE register size is
26340+
// 128-bits.
26341+
if (MinSVESize || !Subtarget->isNeonAvailable())
2633826342
return GenerateFixedLengthSVETBL(Op, Op1, Op2, ShuffleMask, VT, ContainerVT,
2633926343
DAG);
2634026344

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-extract-subvector.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -184,15 +184,11 @@ define void @extract_subvector_v4i64(ptr %a, ptr %b) {
184184
define <2 x half> @extract_subvector_v4f16(<4 x half> %op) {
185185
; CHECK-LABEL: extract_subvector_v4f16:
186186
; CHECK: // %bb.0:
187-
; CHECK-NEXT: sub sp, sp, #16
188-
; CHECK-NEXT: .cfi_def_cfa_offset 16
187+
; CHECK-NEXT: adrp x8, .LCPI12_0
189188
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
190-
; CHECK-NEXT: mov z1.h, z0.h[3]
191-
; CHECK-NEXT: mov z0.h, z0.h[2]
192-
; CHECK-NEXT: str h1, [sp, #10]
193-
; CHECK-NEXT: str h0, [sp, #8]
194-
; CHECK-NEXT: ldr d0, [sp, #8]
195-
; CHECK-NEXT: add sp, sp, #16
189+
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI12_0]
190+
; CHECK-NEXT: tbl z0.h, { z0.h }, z1.h
191+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
196192
; CHECK-NEXT: ret
197193
%ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2)
198194
ret <2 x half> %ret

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll

Lines changed: 10 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -81,42 +81,22 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
8181
define void @alloc_v32i8(ptr %st_ptr) nounwind {
8282
; CHECK-LABEL: alloc_v32i8:
8383
; CHECK: // %bb.0:
84-
; CHECK-NEXT: sub sp, sp, #64
85-
; CHECK-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill
84+
; CHECK-NEXT: sub sp, sp, #48
85+
; CHECK-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill
8686
; CHECK-NEXT: mov x19, x0
87-
; CHECK-NEXT: add x0, sp, #16
87+
; CHECK-NEXT: mov x0, sp
8888
; CHECK-NEXT: bl def
89-
; CHECK-NEXT: ldp q0, q3, [sp, #16]
90-
; CHECK-NEXT: mov z1.b, z0.b[14]
91-
; CHECK-NEXT: fmov w8, s0
92-
; CHECK-NEXT: mov z4.b, z0.b[10]
93-
; CHECK-NEXT: mov z2.b, z0.b[12]
94-
; CHECK-NEXT: mov z5.b, z0.b[8]
95-
; CHECK-NEXT: strb w8, [sp]
96-
; CHECK-NEXT: fmov w8, s1
97-
; CHECK-NEXT: mov z1.b, z0.b[6]
98-
; CHECK-NEXT: fmov w9, s2
99-
; CHECK-NEXT: mov z2.b, z0.b[4]
100-
; CHECK-NEXT: mov z0.b, z0.b[2]
101-
; CHECK-NEXT: strb w8, [sp, #7]
102-
; CHECK-NEXT: fmov w8, s4
103-
; CHECK-NEXT: strb w9, [sp, #6]
104-
; CHECK-NEXT: fmov w9, s5
105-
; CHECK-NEXT: strb w8, [sp, #5]
89+
; CHECK-NEXT: adrp x8, .LCPI2_0
90+
; CHECK-NEXT: ldr q0, [sp]
91+
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI2_0]
92+
; CHECK-NEXT: tbl z0.b, { z0.b }, z1.b
93+
; CHECK-NEXT: ldr q1, [sp, #16]
10694
; CHECK-NEXT: fmov w8, s1
107-
; CHECK-NEXT: strb w9, [sp, #4]
108-
; CHECK-NEXT: strb w8, [sp, #3]
109-
; CHECK-NEXT: fmov w8, s2
110-
; CHECK-NEXT: strb w8, [sp, #2]
111-
; CHECK-NEXT: fmov w8, s0
112-
; CHECK-NEXT: strb w8, [sp, #1]
113-
; CHECK-NEXT: fmov w8, s3
11495
; CHECK-NEXT: strb w8, [x19, #8]
115-
; CHECK-NEXT: ldr q0, [sp]
11696
; CHECK-NEXT: fmov x8, d0
11797
; CHECK-NEXT: str x8, [x19]
118-
; CHECK-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload
119-
; CHECK-NEXT: add sp, sp, #64
98+
; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
99+
; CHECK-NEXT: add sp, sp, #48
120100
; CHECK-NEXT: ret
121101
%alloc = alloca [32 x i8]
122102
call void @def(ptr %alloc)

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-rev.ll

Lines changed: 10 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -185,19 +185,11 @@ define void @test_revhv32i16(ptr %a) {
185185
define void @test_rev_elts_fail(ptr %a) {
186186
; CHECK-LABEL: test_rev_elts_fail:
187187
; CHECK: // %bb.0:
188-
; CHECK-NEXT: ldp q1, q0, [x0]
189-
; CHECK-NEXT: mov z2.d, z0.d[1]
190-
; CHECK-NEXT: fmov x8, d0
191-
; CHECK-NEXT: mov z0.d, z1.d[1]
192-
; CHECK-NEXT: fmov x9, d2
193-
; CHECK-NEXT: stp x9, x8, [sp, #-32]!
194-
; CHECK-NEXT: .cfi_def_cfa_offset 32
195-
; CHECK-NEXT: fmov x8, d1
196-
; CHECK-NEXT: fmov x9, d0
197-
; CHECK-NEXT: stp x9, x8, [sp, #16]
198-
; CHECK-NEXT: ldp q1, q0, [sp]
199-
; CHECK-NEXT: stp q0, q1, [x0]
200-
; CHECK-NEXT: add sp, sp, #32
188+
; CHECK-NEXT: index z0.d, #1, #-1
189+
; CHECK-NEXT: ldp q1, q2, [x0]
190+
; CHECK-NEXT: tbl z1.d, { z1.d }, z0.d
191+
; CHECK-NEXT: tbl z0.d, { z2.d }, z0.d
192+
; CHECK-NEXT: stp q1, q0, [x0]
201193
; CHECK-NEXT: ret
202194
%tmp1 = load <4 x i64>, ptr %a
203195
%tmp2 = shufflevector <4 x i64> %tmp1, <4 x i64> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
@@ -240,30 +232,11 @@ define void @test_revdv4f64_sve2p1(ptr %a) #1 {
240232
define void @test_revv8i32(ptr %a) {
241233
; CHECK-LABEL: test_revv8i32:
242234
; CHECK: // %bb.0:
243-
; CHECK-NEXT: sub sp, sp, #32
244-
; CHECK-NEXT: .cfi_def_cfa_offset 32
245-
; CHECK-NEXT: ldp q0, q3, [x0]
246-
; CHECK-NEXT: mov z1.s, z0.s[1]
247-
; CHECK-NEXT: mov z2.s, z0.s[2]
248-
; CHECK-NEXT: mov z4.s, z0.s[3]
249-
; CHECK-NEXT: fmov w8, s0
250-
; CHECK-NEXT: mov z0.s, z3.s[1]
251-
; CHECK-NEXT: fmov w9, s1
252-
; CHECK-NEXT: mov z1.s, z3.s[2]
253-
; CHECK-NEXT: stp w9, w8, [sp, #24]
254-
; CHECK-NEXT: fmov w8, s2
255-
; CHECK-NEXT: fmov w9, s4
256-
; CHECK-NEXT: mov z2.s, z3.s[3]
257-
; CHECK-NEXT: stp w9, w8, [sp, #16]
258-
; CHECK-NEXT: fmov w8, s3
259-
; CHECK-NEXT: fmov w9, s0
260-
; CHECK-NEXT: stp w9, w8, [sp, #8]
261-
; CHECK-NEXT: fmov w8, s1
262-
; CHECK-NEXT: fmov w9, s2
263-
; CHECK-NEXT: stp w9, w8, [sp]
264-
; CHECK-NEXT: ldp q0, q1, [sp]
265-
; CHECK-NEXT: stp q0, q1, [x0]
266-
; CHECK-NEXT: add sp, sp, #32
235+
; CHECK-NEXT: index z0.s, #3, #-1
236+
; CHECK-NEXT: ldp q2, q1, [x0]
237+
; CHECK-NEXT: tbl z1.s, { z1.s }, z0.s
238+
; CHECK-NEXT: tbl z0.s, { z2.s }, z0.s
239+
; CHECK-NEXT: stp q1, q0, [x0]
267240
; CHECK-NEXT: ret
268241
%tmp1 = load <8 x i32>, ptr %a
269242
%tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-permute-zip-uzp-trn.ll

Lines changed: 35 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -342,21 +342,14 @@ define void @zip_v4i32(ptr %a, ptr %b) {
342342
define void @zip1_v8i32_undef(ptr %a) {
343343
; CHECK-LABEL: zip1_v8i32_undef:
344344
; CHECK: // %bb.0:
345-
; CHECK-NEXT: sub sp, sp, #16
346-
; CHECK-NEXT: .cfi_def_cfa_offset 16
345+
; CHECK-NEXT: adrp x8, .LCPI6_0
347346
; CHECK-NEXT: ldr q0, [x0, #16]
348347
; CHECK-NEXT: ldr q0, [x0]
349-
; CHECK-NEXT: mov z1.s, z0.s[3]
350-
; CHECK-NEXT: mov z2.s, z0.s[2]
348+
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI6_0]
349+
; CHECK-NEXT: tbl z1.s, { z0.s }, z1.s
351350
; CHECK-NEXT: zip1 z0.s, z0.s, z0.s
352-
; CHECK-NEXT: fmov w8, s1
353-
; CHECK-NEXT: fmov w9, s2
354-
; CHECK-NEXT: stp w8, w8, [sp, #8]
355-
; CHECK-NEXT: stp w9, w9, [sp]
356-
; CHECK-NEXT: ldr q1, [sp]
357-
; CHECK-NEXT: str q0, [x0]
358351
; CHECK-NEXT: str q1, [x0, #16]
359-
; CHECK-NEXT: add sp, sp, #16
352+
; CHECK-NEXT: str q0, [x0]
360353
; CHECK-NEXT: ret
361354
%tmp1 = load volatile <8 x i32>, ptr %a
362355
%tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
@@ -389,41 +382,15 @@ define void @trn_v32i8(ptr %a, ptr %b) {
389382
define void @trn_v8i16(ptr %a, ptr %b) {
390383
; CHECK-LABEL: trn_v8i16:
391384
; CHECK: // %bb.0:
385+
; CHECK-NEXT: adrp x8, .LCPI8_0
386+
; CHECK-NEXT: adrp x9, .LCPI8_1
392387
; CHECK-NEXT: ldr q0, [x0]
393-
; CHECK-NEXT: fmov w8, s0
394-
; CHECK-NEXT: mov z1.h, z0.h[3]
395-
; CHECK-NEXT: mov z2.h, z0.h[1]
396-
; CHECK-NEXT: mov z3.h, z0.h[5]
397-
; CHECK-NEXT: mov z4.h, z0.h[4]
398-
; CHECK-NEXT: strh w8, [sp, #-32]!
399-
; CHECK-NEXT: .cfi_def_cfa_offset 32
400-
; CHECK-NEXT: fmov w8, s1
401-
; CHECK-NEXT: mov z1.h, z0.h[2]
402-
; CHECK-NEXT: fmov w9, s2
403-
; CHECK-NEXT: mov z2.h, z0.h[6]
404-
; CHECK-NEXT: mov z0.h, z0.h[7]
405-
; CHECK-NEXT: fmov w10, s3
406-
; CHECK-NEXT: fmov w11, s4
407-
; CHECK-NEXT: fmov w12, s1
408-
; CHECK-NEXT: strh w8, [sp, #14]
409-
; CHECK-NEXT: fmov w13, s2
410-
; CHECK-NEXT: strh w9, [sp, #12]
411-
; CHECK-NEXT: strh w10, [sp, #10]
412-
; CHECK-NEXT: strh w12, [sp, #4]
413-
; CHECK-NEXT: fmov w12, s0
414-
; CHECK-NEXT: strh w11, [sp, #8]
415-
; CHECK-NEXT: strh w13, [sp, #6]
416-
; CHECK-NEXT: strh w12, [sp, #2]
417-
; CHECK-NEXT: strh w12, [sp, #28]
418-
; CHECK-NEXT: strh w11, [sp, #26]
419-
; CHECK-NEXT: strh w10, [sp, #22]
420-
; CHECK-NEXT: strh w8, [sp, #20]
421-
; CHECK-NEXT: strh w13, [sp, #18]
422-
; CHECK-NEXT: strh w9, [sp, #16]
423-
; CHECK-NEXT: ldp q0, q1, [sp]
424-
; CHECK-NEXT: add z0.h, z0.h, z1.h
388+
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI8_0]
389+
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI8_1]
390+
; CHECK-NEXT: tbl z1.h, { z0.h }, z1.h
391+
; CHECK-NEXT: tbl z0.h, { z0.h }, z2.h
392+
; CHECK-NEXT: add z0.h, z1.h, z0.h
425393
; CHECK-NEXT: str q0, [x0]
426-
; CHECK-NEXT: add sp, sp, #32
427394
; CHECK-NEXT: ret
428395
%tmp1 = load <8 x i16>, ptr %a
429396
%tmp2 = load <8 x i16>, ptr %b
@@ -692,21 +659,14 @@ define void @zip2_v8i32(ptr %a, ptr %b) #0{
692659
define void @zip2_v8i32_undef(ptr %a) #0{
693660
; CHECK-LABEL: zip2_v8i32_undef:
694661
; CHECK: // %bb.0:
695-
; CHECK-NEXT: sub sp, sp, #16
696-
; CHECK-NEXT: .cfi_def_cfa_offset 16
662+
; CHECK-NEXT: adrp x8, .LCPI17_0
697663
; CHECK-NEXT: ldr q0, [x0]
698664
; CHECK-NEXT: ldr q0, [x0, #16]
699-
; CHECK-NEXT: mov z1.s, z0.s[3]
700-
; CHECK-NEXT: mov z2.s, z0.s[2]
665+
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI17_0]
666+
; CHECK-NEXT: tbl z1.s, { z0.s }, z1.s
701667
; CHECK-NEXT: zip1 z0.s, z0.s, z0.s
702-
; CHECK-NEXT: fmov w8, s1
703-
; CHECK-NEXT: fmov w9, s2
704-
; CHECK-NEXT: stp w8, w8, [sp, #8]
705-
; CHECK-NEXT: stp w9, w9, [sp]
706-
; CHECK-NEXT: ldr q1, [sp]
707-
; CHECK-NEXT: str q0, [x0]
708668
; CHECK-NEXT: str q1, [x0, #16]
709-
; CHECK-NEXT: add sp, sp, #16
669+
; CHECK-NEXT: str q0, [x0]
710670
; CHECK-NEXT: ret
711671
%tmp1 = load volatile <8 x i32>, ptr %a
712672
%tmp2 = shufflevector <8 x i32> %tmp1, <8 x i32> undef, <8 x i32> <i32 4, i32 4, i32 5, i32 5, i32 6, i32 6, i32 7, i32 7>
@@ -921,26 +881,15 @@ define void @uzp_v32i8(ptr %a, ptr %b) #0{
921881
define void @uzp_v4i16(ptr %a, ptr %b) #0{
922882
; CHECK-LABEL: uzp_v4i16:
923883
; CHECK: // %bb.0:
884+
; CHECK-NEXT: adrp x8, .LCPI19_0
885+
; CHECK-NEXT: adrp x9, .LCPI19_1
924886
; CHECK-NEXT: ldr d0, [x0]
925-
; CHECK-NEXT: mov z1.h, z0.h[1]
926-
; CHECK-NEXT: fmov w8, s0
927-
; CHECK-NEXT: mov z2.h, z0.h[2]
928-
; CHECK-NEXT: mov z3.h, z0.h[3]
929-
; CHECK-NEXT: fmov w9, s1
930-
; CHECK-NEXT: strh w8, [sp, #-16]!
931-
; CHECK-NEXT: .cfi_def_cfa_offset 16
932-
; CHECK-NEXT: fmov w10, s2
933-
; CHECK-NEXT: fmov w11, s3
934-
; CHECK-NEXT: strh w9, [sp, #6]
935-
; CHECK-NEXT: strh w8, [sp, #10]
936-
; CHECK-NEXT: strh w9, [sp, #8]
937-
; CHECK-NEXT: strh w10, [sp, #4]
938-
; CHECK-NEXT: strh w11, [sp, #2]
939-
; CHECK-NEXT: strh w10, [sp, #12]
940-
; CHECK-NEXT: ldp d0, d1, [sp]
941-
; CHECK-NEXT: add z0.h, z0.h, z1.h
887+
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI19_0]
888+
; CHECK-NEXT: ldr q2, [x9, :lo12:.LCPI19_1]
889+
; CHECK-NEXT: tbl z1.h, { z0.h }, z1.h
890+
; CHECK-NEXT: tbl z0.h, { z0.h }, z2.h
891+
; CHECK-NEXT: add z0.h, z1.h, z0.h
942892
; CHECK-NEXT: str d0, [x0]
943-
; CHECK-NEXT: add sp, sp, #16
944893
; CHECK-NEXT: ret
945894
%tmp1 = load <4 x i16>, ptr %a
946895
%tmp2 = load <4 x i16>, ptr %b
@@ -1071,11 +1020,12 @@ define void @uzp_v16i16(ptr %a, ptr %b) #0{
10711020
define void @uzp_v8f32(ptr %a, ptr %b) #0{
10721021
; CHECK-LABEL: uzp_v8f32:
10731022
; CHECK: // %bb.0:
1074-
; CHECK-NEXT: sub sp, sp, #64
1075-
; CHECK-NEXT: .cfi_def_cfa_offset 64
1023+
; CHECK-NEXT: sub sp, sp, #48
1024+
; CHECK-NEXT: .cfi_def_cfa_offset 48
10761025
; CHECK-NEXT: ldp q2, q0, [x0]
1077-
; CHECK-NEXT: ptrue p0.s, vl4
1026+
; CHECK-NEXT: adrp x8, .LCPI21_0
10781027
; CHECK-NEXT: ldp q4, q1, [x1]
1028+
; CHECK-NEXT: ptrue p0.s, vl4
10791029
; CHECK-NEXT: mov z3.s, z0.s[2]
10801030
; CHECK-NEXT: mov z5.s, z1.s[2]
10811031
; CHECK-NEXT: stp s0, s3, [sp, #24]
@@ -1085,17 +1035,17 @@ define void @uzp_v8f32(ptr %a, ptr %b) #0{
10851035
; CHECK-NEXT: mov z0.s, z0.s[1]
10861036
; CHECK-NEXT: stp s3, s1, [sp, #4]
10871037
; CHECK-NEXT: mov z1.s, z2.s[1]
1088-
; CHECK-NEXT: stp s0, s5, [sp, #40]
1089-
; CHECK-NEXT: mov z5.s, z4.s[3]
1090-
; CHECK-NEXT: mov z4.s, z4.s[1]
1038+
; CHECK-NEXT: str s5, [sp, #44]
1039+
; CHECK-NEXT: ldr q5, [x8, :lo12:.LCPI21_0]
1040+
; CHECK-NEXT: str s0, [sp, #40]
10911041
; CHECK-NEXT: ldp q3, q2, [sp]
1042+
; CHECK-NEXT: tbl z0.s, { z4.s }, z5.s
10921043
; CHECK-NEXT: str s1, [sp, #32]
1093-
; CHECK-NEXT: stp s4, s5, [sp, #48]
1094-
; CHECK-NEXT: ldp q0, q1, [sp, #32]
1095-
; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z2.s
1096-
; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z3.s
1097-
; CHECK-NEXT: stp q0, q1, [x0]
1098-
; CHECK-NEXT: add sp, sp, #64
1044+
; CHECK-NEXT: ldr q1, [sp, #32]
1045+
; CHECK-NEXT: fadd z1.s, p0/m, z1.s, z2.s
1046+
; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z3.s
1047+
; CHECK-NEXT: stp q1, q0, [x0]
1048+
; CHECK-NEXT: add sp, sp, #48
10991049
; CHECK-NEXT: ret
11001050
%tmp1 = load <8 x float>, ptr %a
11011051
%tmp2 = load <8 x float>, ptr %b

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-vector-shuffle.ll

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,11 @@ target triple = "aarch64-unknown-linux-gnu"
88
define <4 x i8> @shuffle_ext_byone_v4i8(<4 x i8> %op1, <4 x i8> %op2) {
99
; CHECK-LABEL: shuffle_ext_byone_v4i8:
1010
; CHECK: // %bb.0:
11-
; CHECK-NEXT: sub sp, sp, #16
12-
; CHECK-NEXT: .cfi_def_cfa_offset 16
11+
; CHECK-NEXT: adrp x8, .LCPI0_0
1312
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
14-
; CHECK-NEXT: mov z1.h, z0.h[1]
15-
; CHECK-NEXT: fmov w8, s0
16-
; CHECK-NEXT: mov z2.h, z0.h[2]
17-
; CHECK-NEXT: mov z3.h, z0.h[3]
18-
; CHECK-NEXT: strh w8, [sp, #8]
19-
; CHECK-NEXT: fmov w8, s1
20-
; CHECK-NEXT: fmov w9, s2
21-
; CHECK-NEXT: strh w8, [sp, #14]
22-
; CHECK-NEXT: fmov w8, s3
23-
; CHECK-NEXT: strh w9, [sp, #12]
24-
; CHECK-NEXT: strh w8, [sp, #10]
25-
; CHECK-NEXT: ldr d0, [sp, #8]
26-
; CHECK-NEXT: add sp, sp, #16
13+
; CHECK-NEXT: ldr q1, [x8, :lo12:.LCPI0_0]
14+
; CHECK-NEXT: tbl z0.h, { z0.h }, z1.h
15+
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
2716
; CHECK-NEXT: ret
2817
%ret = shufflevector <4 x i8> %op1, <4 x i8> %op2, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
2918
ret <4 x i8> %ret

0 commit comments

Comments
 (0)