Skip to content

Commit eb3a671

Browse files
[AArch64] Avoid vector interleave instructions when NEON and SVE are unavailable (#90723)
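As the title says, the interleaved-access lowering incorrectly assumed that it could use NEON or SVE instructions to implement an interleaved load/store operation, even when neither feature is available in the selected runtime mode (for example, streaming-compatible code built without SVE or SME). The legality check now lives entirely in isLegalInterleavedAccessType, which rejects fixed-length vector types unless NEON is available or SVE is being used for fixed-length vectors.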
1 parent a126225 commit eb3a671

File tree

3 files changed

+127
-33
lines changed

3 files changed

+127
-33
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 14 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15979,7 +15979,8 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType(
1597915979

1598015980
UseScalable = false;
1598115981

15982-
if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
15982+
if (!VecTy->isScalableTy() && !Subtarget->isNeonAvailable() &&
15983+
!Subtarget->useSVEForFixedLengthVectors())
1598315984
return false;
1598415985

1598515986
if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
@@ -16003,18 +16004,20 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType(
1600316004
}
1600416005

1600516006
unsigned VecSize = DL.getTypeSizeInBits(VecTy);
16006-
if (!Subtarget->isNeonAvailable() ||
16007-
(Subtarget->useSVEForFixedLengthVectors() &&
16008-
(VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
16009-
(VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
16010-
isPowerOf2_32(MinElts) && VecSize > 128)))) {
16011-
UseScalable = true;
16012-
return true;
16007+
if (Subtarget->useSVEForFixedLengthVectors()) {
16008+
unsigned MinSVEVectorSize =
16009+
std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
16010+
if (VecSize % MinSVEVectorSize == 0 ||
16011+
(VecSize < MinSVEVectorSize && isPowerOf2_32(MinElts) &&
16012+
(!Subtarget->isNeonAvailable() || VecSize > 128))) {
16013+
UseScalable = true;
16014+
return true;
16015+
}
1601316016
}
1601416017

1601516018
// Ensure the total vector size is 64 or a multiple of 128. Types larger than
1601616019
// 128 will be split into multiple interleaved accesses.
16017-
return VecSize == 64 || VecSize % 128 == 0;
16020+
return Subtarget->isNeonAvailable() && (VecSize == 64 || VecSize % 128 == 0);
1601816021
}
1601916022

1602016023
static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
@@ -16105,8 +16108,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
1610516108
// "legalize" wide vector types into multiple interleaved accesses as long as
1610616109
// the vector types are divisible by 128.
1610716110
bool UseScalable;
16108-
if (!Subtarget->hasNEON() ||
16109-
!isLegalInterleavedAccessType(VTy, DL, UseScalable))
16111+
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
1611016112
return false;
1611116113

1611216114
unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
@@ -16283,8 +16285,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
1628316285
// Skip if we do not have NEON and skip illegal vector types. We can
1628416286
// "legalize" wide vector types into multiple interleaved accesses as long as
1628516287
// the vector types are divisible by 128.
16286-
if (!Subtarget->hasNEON() ||
16287-
!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
16288+
if (!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
1628816289
return false;
1628916290

1629016291
unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-ld2-alloca.ll

Lines changed: 84 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
3+
; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
4+
; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
35

46
target triple = "aarch64-unknown-linux-gnu"
57

@@ -28,6 +30,23 @@ define void @alloc_v4i8(ptr %st_ptr) nounwind {
2830
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
2931
; CHECK-NEXT: add sp, sp, #48
3032
; CHECK-NEXT: ret
33+
;
34+
; NONEON-NOSVE-LABEL: alloc_v4i8:
35+
; NONEON-NOSVE: // %bb.0:
36+
; NONEON-NOSVE-NEXT: sub sp, sp, #32
37+
; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
38+
; NONEON-NOSVE-NEXT: mov x19, x0
39+
; NONEON-NOSVE-NEXT: add x0, sp, #12
40+
; NONEON-NOSVE-NEXT: bl def
41+
; NONEON-NOSVE-NEXT: ldr s0, [sp, #12]
42+
; NONEON-NOSVE-NEXT: ushll v0.8h, v0.8b, #0
43+
; NONEON-NOSVE-NEXT: umov w8, v0.h[2]
44+
; NONEON-NOSVE-NEXT: umov w9, v0.h[0]
45+
; NONEON-NOSVE-NEXT: strb w8, [x19, #1]
46+
; NONEON-NOSVE-NEXT: strb w9, [x19]
47+
; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
48+
; NONEON-NOSVE-NEXT: add sp, sp, #32
49+
; NONEON-NOSVE-NEXT: ret
3150
%alloc = alloca [4 x i8]
3251
call void @def(ptr %alloc)
3352
%load = load <4 x i8>, ptr %alloc
@@ -40,38 +59,51 @@ define void @alloc_v6i8(ptr %st_ptr) nounwind {
4059
; CHECK-LABEL: alloc_v6i8:
4160
; CHECK: // %bb.0:
4261
; CHECK-NEXT: sub sp, sp, #48
43-
; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
62+
; CHECK-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill
4463
; CHECK-NEXT: mov x19, x0
4564
; CHECK-NEXT: add x0, sp, #24
46-
; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
47-
; CHECK-NEXT: add x20, sp, #24
4865
; CHECK-NEXT: bl def
49-
; CHECK-NEXT: ptrue p0.b, vl3
50-
; CHECK-NEXT: ptrue p1.s, vl2
51-
; CHECK-NEXT: ld2b { z0.b, z1.b }, p0/z, [x20]
66+
; CHECK-NEXT: ldr d0, [sp, #24]
5267
; CHECK-NEXT: ptrue p0.h, vl4
53-
; CHECK-NEXT: mov z2.b, z1.b[3]
68+
; CHECK-NEXT: ptrue p1.s, vl2
69+
; CHECK-NEXT: mov z1.b, z0.b[3]
70+
; CHECK-NEXT: mov z2.b, z0.b[5]
71+
; CHECK-NEXT: mov z0.b, z0.b[1]
5472
; CHECK-NEXT: fmov w8, s1
55-
; CHECK-NEXT: mov z3.b, z1.b[2]
56-
; CHECK-NEXT: mov z4.b, z1.b[1]
57-
; CHECK-NEXT: strh w8, [sp]
58-
; CHECK-NEXT: fmov w8, s2
59-
; CHECK-NEXT: fmov w9, s3
60-
; CHECK-NEXT: strh w8, [sp, #6]
61-
; CHECK-NEXT: fmov w8, s4
62-
; CHECK-NEXT: strh w9, [sp, #4]
63-
; CHECK-NEXT: strh w8, [sp, #2]
64-
; CHECK-NEXT: add x8, sp, #12
65-
; CHECK-NEXT: ldr d0, [sp]
73+
; CHECK-NEXT: fmov w9, s2
74+
; CHECK-NEXT: strh w8, [sp, #10]
75+
; CHECK-NEXT: fmov w8, s0
76+
; CHECK-NEXT: strh w9, [sp, #12]
77+
; CHECK-NEXT: strh w8, [sp, #8]
78+
; CHECK-NEXT: add x8, sp, #20
79+
; CHECK-NEXT: ldr d0, [sp, #8]
6680
; CHECK-NEXT: st1b { z0.h }, p0, [x8]
6781
; CHECK-NEXT: ld1h { z0.s }, p1/z, [x8]
6882
; CHECK-NEXT: strb w9, [x19, #2]
69-
; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
7083
; CHECK-NEXT: fmov w8, s0
7184
; CHECK-NEXT: strh w8, [x19]
72-
; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
85+
; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
7386
; CHECK-NEXT: add sp, sp, #48
7487
; CHECK-NEXT: ret
88+
;
89+
; NONEON-NOSVE-LABEL: alloc_v6i8:
90+
; NONEON-NOSVE: // %bb.0:
91+
; NONEON-NOSVE-NEXT: sub sp, sp, #32
92+
; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill
93+
; NONEON-NOSVE-NEXT: mov x19, x0
94+
; NONEON-NOSVE-NEXT: add x0, sp, #8
95+
; NONEON-NOSVE-NEXT: bl def
96+
; NONEON-NOSVE-NEXT: ldr d0, [sp, #8]
97+
; NONEON-NOSVE-NEXT: add x9, x19, #2
98+
; NONEON-NOSVE-NEXT: rev16 v1.16b, v0.16b
99+
; NONEON-NOSVE-NEXT: xtn v1.8b, v1.8h
100+
; NONEON-NOSVE-NEXT: str s1, [sp, #4]
101+
; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4]
102+
; NONEON-NOSVE-NEXT: st1 { v0.b }[5], [x9]
103+
; NONEON-NOSVE-NEXT: strh w8, [x19]
104+
; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload
105+
; NONEON-NOSVE-NEXT: add sp, sp, #32
106+
; NONEON-NOSVE-NEXT: ret
75107
%alloc = alloca [6 x i8]
76108
call void @def(ptr %alloc)
77109
%load = load <6 x i8>, ptr %alloc
@@ -100,6 +132,22 @@ define void @alloc_v32i8(ptr %st_ptr) nounwind {
100132
; CHECK-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
101133
; CHECK-NEXT: add sp, sp, #48
102134
; CHECK-NEXT: ret
135+
;
136+
; NONEON-NOSVE-LABEL: alloc_v32i8:
137+
; NONEON-NOSVE: // %bb.0:
138+
; NONEON-NOSVE-NEXT: sub sp, sp, #48
139+
; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #32] // 16-byte Folded Spill
140+
; NONEON-NOSVE-NEXT: mov x19, x0
141+
; NONEON-NOSVE-NEXT: mov x0, sp
142+
; NONEON-NOSVE-NEXT: bl def
143+
; NONEON-NOSVE-NEXT: ldp q0, q1, [sp]
144+
; NONEON-NOSVE-NEXT: add x8, x19, #8
145+
; NONEON-NOSVE-NEXT: xtn v0.8b, v0.8h
146+
; NONEON-NOSVE-NEXT: st1 { v1.b }[0], [x8]
147+
; NONEON-NOSVE-NEXT: str d0, [x19]
148+
; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #32] // 16-byte Folded Reload
149+
; NONEON-NOSVE-NEXT: add sp, sp, #48
150+
; NONEON-NOSVE-NEXT: ret
103151
%alloc = alloca [32 x i8]
104152
call void @def(ptr %alloc)
105153
%load = load <32 x i8>, ptr %alloc
@@ -128,6 +176,22 @@ define void @alloc_v8f64(ptr %st_ptr) nounwind {
128176
; CHECK-NEXT: ldp x20, x19, [sp, #80] // 16-byte Folded Reload
129177
; CHECK-NEXT: add sp, sp, #96
130178
; CHECK-NEXT: ret
179+
;
180+
; NONEON-NOSVE-LABEL: alloc_v8f64:
181+
; NONEON-NOSVE: // %bb.0:
182+
; NONEON-NOSVE-NEXT: sub sp, sp, #80
183+
; NONEON-NOSVE-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
184+
; NONEON-NOSVE-NEXT: mov x19, x0
185+
; NONEON-NOSVE-NEXT: mov x0, sp
186+
; NONEON-NOSVE-NEXT: bl def
187+
; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #32]
188+
; NONEON-NOSVE-NEXT: ldp q3, q2, [sp]
189+
; NONEON-NOSVE-NEXT: zip1 v0.2d, v1.2d, v0.2d
190+
; NONEON-NOSVE-NEXT: zip1 v1.2d, v3.2d, v2.2d
191+
; NONEON-NOSVE-NEXT: stp q1, q0, [x19]
192+
; NONEON-NOSVE-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
193+
; NONEON-NOSVE-NEXT: add sp, sp, #80
194+
; NONEON-NOSVE-NEXT: ret
131195
%alloc = alloca [8 x double]
132196
call void @def(ptr %alloc)
133197
%load = load <8 x double>, ptr %alloc

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-shuffle.ll

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -mattr=+sve -force-streaming-compatible-sve < %s | FileCheck %s
33
; RUN: llc -mattr=+sme -force-streaming-compatible-sve < %s | FileCheck %s
4+
; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s --check-prefix=NONEON-NOSVE
45

56

67
target triple = "aarch64-unknown-linux-gnu"
@@ -14,6 +15,13 @@ define void @hang_when_merging_stores_after_legalisation(ptr %a, <2 x i32> %b) {
1415
; CHECK-NEXT: mov z1.d, z0.d
1516
; CHECK-NEXT: st2w { z0.s, z1.s }, p0, [x0]
1617
; CHECK-NEXT: ret
18+
;
19+
; NONEON-NOSVE-LABEL: hang_when_merging_stores_after_legalisation:
20+
; NONEON-NOSVE: // %bb.0:
21+
; NONEON-NOSVE-NEXT: // kill: def $d0 killed $d0 def $q0
22+
; NONEON-NOSVE-NEXT: dup v0.4s, v0.s[0]
23+
; NONEON-NOSVE-NEXT: stp q0, q0, [x0]
24+
; NONEON-NOSVE-NEXT: ret
1725
%splat = shufflevector <2 x i32> %b, <2 x i32> undef, <8 x i32> zeroinitializer
1826
%interleaved.vec = shufflevector <8 x i32> %splat, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
1927
store <8 x i32> %interleaved.vec, ptr %a, align 4
@@ -28,6 +36,13 @@ define void @interleave_store_without_splat(ptr %a, <4 x i32> %v1, <4 x i32> %v2
2836
; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0_z1 def $z0_z1
2937
; CHECK-NEXT: st2w { z0.s, z1.s }, p0, [x0]
3038
; CHECK-NEXT: ret
39+
;
40+
; NONEON-NOSVE-LABEL: interleave_store_without_splat:
41+
; NONEON-NOSVE: // %bb.0:
42+
; NONEON-NOSVE-NEXT: zip2 v2.4s, v0.4s, v1.4s
43+
; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v1.4s
44+
; NONEON-NOSVE-NEXT: stp q0, q2, [x0]
45+
; NONEON-NOSVE-NEXT: ret
3146
%shuffle = shufflevector <4 x i32> %v1, <4 x i32> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
3247
%interleaved = shufflevector <8 x i32> %shuffle, <8 x i32> undef, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
3348
store <8 x i32> %interleaved, ptr %a, align 1
@@ -46,6 +61,16 @@ define void @interleave_store_legalization(ptr %a, <8 x i32> %v1, <8 x i32> %v2)
4661
; CHECK-NEXT: st2w { z4.s, z5.s }, p0, [x0]
4762
; CHECK-NEXT: st2w { z2.s, z3.s }, p0, [x0, x8, lsl #2]
4863
; CHECK-NEXT: ret
64+
;
65+
; NONEON-NOSVE-LABEL: interleave_store_legalization:
66+
; NONEON-NOSVE: // %bb.0:
67+
; NONEON-NOSVE-NEXT: zip2 v4.4s, v1.4s, v3.4s
68+
; NONEON-NOSVE-NEXT: zip1 v1.4s, v1.4s, v3.4s
69+
; NONEON-NOSVE-NEXT: zip2 v3.4s, v0.4s, v2.4s
70+
; NONEON-NOSVE-NEXT: zip1 v0.4s, v0.4s, v2.4s
71+
; NONEON-NOSVE-NEXT: stp q1, q4, [x0, #32]
72+
; NONEON-NOSVE-NEXT: stp q0, q3, [x0]
73+
; NONEON-NOSVE-NEXT: ret
4974
%interleaved.vec = shufflevector <8 x i32> %v1, <8 x i32> %v2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11,
5075
i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
5176
store <16 x i32> %interleaved.vec, ptr %a, align 4
@@ -57,6 +82,10 @@ define void @crash_when_lowering_extract_shuffle(ptr %dst, i1 %cond) {
5782
; CHECK-LABEL: crash_when_lowering_extract_shuffle:
5883
; CHECK: // %bb.0:
5984
; CHECK-NEXT: ret
85+
;
86+
; NONEON-NOSVE-LABEL: crash_when_lowering_extract_shuffle:
87+
; NONEON-NOSVE: // %bb.0:
88+
; NONEON-NOSVE-NEXT: ret
6089
%broadcast.splat = shufflevector <32 x i1> zeroinitializer, <32 x i1> zeroinitializer, <32 x i32> zeroinitializer
6190
br i1 %cond, label %exit, label %vector.body
6291

0 commit comments

Comments
 (0)