
Commit 2c72d90
[AArch64-SVE]: Force generating code compatible with streaming mode.

Add a compile-time flag for enabling streaming mode. When streaming mode is
enabled, lower basic loads and stores of fixed-width vectors to generate code
that is compatible with streaming mode.

Differential Revision: https://reviews.llvm.org/D133433

Parent: 39b9d4f
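In streaming SVE mode most NEON instructions are illegal, so fixed-width vector operations have to be lowered through streaming-compatible SVE instructions instead. As a minimal illustration of the flag's effect, distilled from the new test added below (the expected instructions in the comments come from its autogenerated CHECK lines):

; Minimal repro, mirroring load_v4i8 from the new test below.
; Run:  llc -force-streaming-compatible-sve < %s
target triple = "aarch64-unknown-linux-gnu"

define <4 x i8> @load_v4i8(<4 x i8>* %a) #0 {
  ; Under the flag this selects a predicated SVE load:
  ;   ptrue p0.h, vl4
  ;   ld1b { z0.h }, p0/z, [x0]
  ; instead of the NEON lowering used otherwise.
  %load = load <4 x i8>, <4 x i8>* %a
  ret <4 x i8> %load
}

attributes #0 = { "target-features"="+sve" }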

6 files changed: 523 additions, 12 deletions


llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (6 additions, 3 deletions)

@@ -5754,7 +5754,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
   case ISD::MLOAD:
     return LowerMLOAD(Op, DAG);
   case ISD::LOAD:
-    if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+    if (useSVEForFixedLengthVectorVT(Op.getValueType(),
+                                     Subtarget->forceStreamingCompatibleSVE()))
       return LowerFixedLengthVectorLoadToSVE(Op, DAG);
     return LowerLOAD(Op, DAG);
   case ISD::ADD:
@@ -11055,7 +11056,8 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,

   ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());

-  if (useSVEForFixedLengthVectorVT(VT))
+  if (useSVEForFixedLengthVectorVT(VT,
+                                   Subtarget->forceStreamingCompatibleSVE()))
     return LowerFixedLengthVECTOR_SHUFFLEToSVE(Op, DAG);

   // Convert shuffles that are directly supported on NEON to target-specific
@@ -11745,7 +11747,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                                  SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();

-  if (useSVEForFixedLengthVectorVT(VT)) {
+  if (useSVEForFixedLengthVectorVT(VT,
+                                   Subtarget->forceStreamingCompatibleSVE())) {
     if (auto SeqInfo = cast<BuildVectorSDNode>(Op)->isConstantSequence()) {
       SDLoc DL(Op);
       EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);

llvm/lib/Target/AArch64/AArch64InstrInfo.td (12 additions, 9 deletions)

@@ -220,6 +220,8 @@ def UseNegativeImmediates

 def UseScalarIncVL : Predicate<"Subtarget->useScalarIncVL()">;

+def NotInStreamingSVEMode : Predicate<"!Subtarget->forceStreamingCompatibleSVE()">;
+
 def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
                                  SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>,
                                                       SDTCisInt<1>]>>;
@@ -7132,16 +7134,17 @@ def : Ld1Lane128IdxOpPat<extloadi8, VectorIndexH, v8i16, i32, LD1i8, VectorIndex

 // Same as above, but the first element is populated using
 // scalar_to_vector + insert_subvector instead of insert_vector_elt.
-class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
-                         SDPatternOperator ExtLoad, Instruction LD1>
-  : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),
-        (ResultTy (EXTRACT_SUBREG
-                   (LD1 (VecTy (IMPLICIT_DEF)), 0, GPR64sp:$Rn), dsub))>;
-
-def : Ld1Lane128FirstElm<v2i32, v8i16, extloadi16, LD1i16>;
-def : Ld1Lane128FirstElm<v2i32, v16i8, extloadi8, LD1i8>;
-def : Ld1Lane128FirstElm<v4i16, v16i8, extloadi8, LD1i8>;
+let Predicates = [NotInStreamingSVEMode] in {
+class Ld1Lane128FirstElm<ValueType ResultTy, ValueType VecTy,
+                         SDPatternOperator ExtLoad, Instruction LD1>
+  : Pat<(ResultTy (scalar_to_vector (i32 (ExtLoad GPR64sp:$Rn)))),
+        (ResultTy (EXTRACT_SUBREG
+                   (LD1 (VecTy (IMPLICIT_DEF)), 0, GPR64sp:$Rn), dsub))>;

+def : Ld1Lane128FirstElm<v2i32, v8i16, extloadi16, LD1i16>;
+def : Ld1Lane128FirstElm<v2i32, v16i8, extloadi8, LD1i8>;
+def : Ld1Lane128FirstElm<v4i16, v16i8, extloadi8, LD1i8>;
+}
 class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
                    ValueType VTy, ValueType STy, Instruction LD1>
   : Pat<(vector_insert (VTy VecListOne64:$Rd),
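The new NotInStreamingSVEMode predicate keeps these NEON LD1-lane patterns from being selected when streaming-compatible code is requested. A likely knock-on effect shows up in the new test below: load_v2i16 is lowered with scalar ldrh loads, fmov, and an SVE zip1 rather than a NEON lane load. A self-contained excerpt, with the expected output taken from the test's autogenerated CHECK lines:

; Mirrors load_v2i16 from the new test below. With the LD1-lane patterns
; predicated out, the checked output is:
;   ldrh w8, [x0, #2]
;   ldrh w9, [x0]
;   fmov s0, w8
;   fmov s1, w9
;   zip1 z0.s, z1.s, z0.s
target triple = "aarch64-unknown-linux-gnu"

define <2 x i16> @load_v2i16(<2 x i16>* %a) #0 {
  %load = load <2 x i16>, <2 x i16>* %a
  ret <2 x i16> %load
}

attributes #0 = { "target-features"="+sve" }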

llvm/lib/Target/AArch64/AArch64Subtarget.cpp (12 additions, 0 deletions)

@@ -65,6 +65,10 @@ ReservedRegsForRA("reserve-regs-for-regalloc", cl::desc("Reserve physical "
                   "Should only be used for testing register allocator."),
                   cl::CommaSeparated, cl::Hidden);

+static cl::opt<bool>
+    ForceStreamingCompatibleSVE("force-streaming-compatible-sve",
+                                cl::init(false), cl::Hidden);
+
 unsigned AArch64Subtarget::getVectorInsertExtractBaseCost() const {
   if (OverrideVectorInsertExtractBaseCost.getNumOccurrences() > 0)
     return OverrideVectorInsertExtractBaseCost;
@@ -431,3 +435,11 @@ void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
 }

 bool AArch64Subtarget::useAA() const { return UseAA; }
+
+bool AArch64Subtarget::forceStreamingCompatibleSVE() const {
+  if (ForceStreamingCompatibleSVE) {
+    assert((hasSVE() || hasSME()) && "Expected SVE to be available");
+    return hasSVE() || hasSME();
+  }
+  return false;
+}

llvm/lib/Target/AArch64/AArch64Subtarget.h (5 additions, 0 deletions)

@@ -368,10 +368,15 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   }

   bool useSVEForFixedLengthVectors() const {
+    if (forceStreamingCompatibleSVE())
+      return true;
+
     // Prefer NEON unless larger SVE registers are available.
     return hasSVE() && getMinSVEVectorSizeInBits() >= 256;
   }

+  bool forceStreamingCompatibleSVE() const;
+
   unsigned getVScaleForTuning() const { return VScaleForTuning; }

   const char* getChkStkName() const {
llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-loads.ll (new file, 230 additions)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -force-streaming-compatible-sve < %s | FileCheck %s

target triple = "aarch64-unknown-linux-gnu"

define <4 x i8> @load_v4i8(<4 x i8>* %a) #0 {
; CHECK-LABEL: load_v4i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    ld1b { z0.h }, p0/z, [x0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %load = load <4 x i8>, <4 x i8>* %a
  ret <4 x i8> %load
}

define <8 x i8> @load_v8i8(<8 x i8>* %a) #0 {
; CHECK-LABEL: load_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ret
  %load = load <8 x i8>, <8 x i8>* %a
  ret <8 x i8> %load
}

define <16 x i8> @load_v16i8(<16 x i8>* %a) #0 {
; CHECK-LABEL: load_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ret
  %load = load <16 x i8>, <16 x i8>* %a
  ret <16 x i8> %load
}

define <32 x i8> @load_v32i8(<32 x i8>* %a) #0 {
; CHECK-LABEL: load_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    ret
  %load = load <32 x i8>, <32 x i8>* %a
  ret <32 x i8> %load
}

define <2 x i16> @load_v2i16(<2 x i16>* %a) #0 {
; CHECK-LABEL: load_v2i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldrh w8, [x0, #2]
; CHECK-NEXT:    ldrh w9, [x0]
; CHECK-NEXT:    fmov s0, w8
; CHECK-NEXT:    fmov s1, w9
; CHECK-NEXT:    zip1 z0.s, z1.s, z0.s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %load = load <2 x i16>, <2 x i16>* %a
  ret <2 x i16> %load
}

define <2 x half> @load_v2f16(<2 x half>* %a) #0 {
; CHECK-LABEL: load_v2f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr s0, [x0]
; CHECK-NEXT:    ret
  %load = load <2 x half>, <2 x half>* %a
  ret <2 x half> %load
}

define <4 x i16> @load_v4i16(<4 x i16>* %a) #0 {
; CHECK-LABEL: load_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ret
  %load = load <4 x i16>, <4 x i16>* %a
  ret <4 x i16> %load
}

define <4 x half> @load_v4f16(<4 x half>* %a) #0 {
; CHECK-LABEL: load_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ret
  %load = load <4 x half>, <4 x half>* %a
  ret <4 x half> %load
}

define <8 x i16> @load_v8i16(<8 x i16>* %a) #0 {
; CHECK-LABEL: load_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ret
  %load = load <8 x i16>, <8 x i16>* %a
  ret <8 x i16> %load
}

define <8 x half> @load_v8f16(<8 x half>* %a) #0 {
; CHECK-LABEL: load_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ret
  %load = load <8 x half>, <8 x half>* %a
  ret <8 x half> %load
}

define <16 x i16> @load_v16i16(<16 x i16>* %a) #0 {
; CHECK-LABEL: load_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    ret
  %load = load <16 x i16>, <16 x i16>* %a
  ret <16 x i16> %load
}

define <16 x half> @load_v16f16(<16 x half>* %a) #0 {
; CHECK-LABEL: load_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    ret
  %load = load <16 x half>, <16 x half>* %a
  ret <16 x half> %load
}

define <2 x i32> @load_v2i32(<2 x i32>* %a) #0 {
; CHECK-LABEL: load_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ret
  %load = load <2 x i32>, <2 x i32>* %a
  ret <2 x i32> %load
}

define <2 x float> @load_v2f32(<2 x float>* %a) #0 {
; CHECK-LABEL: load_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ret
  %load = load <2 x float>, <2 x float>* %a
  ret <2 x float> %load
}

define <4 x i32> @load_v4i32(<4 x i32>* %a) #0 {
; CHECK-LABEL: load_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ret
  %load = load <4 x i32>, <4 x i32>* %a
  ret <4 x i32> %load
}

define <4 x float> @load_v4f32(<4 x float>* %a) #0 {
; CHECK-LABEL: load_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ret
  %load = load <4 x float>, <4 x float>* %a
  ret <4 x float> %load
}

define <8 x i32> @load_v8i32(<8 x i32>* %a) #0 {
; CHECK-LABEL: load_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    ret
  %load = load <8 x i32>, <8 x i32>* %a
  ret <8 x i32> %load
}

define <8 x float> @load_v8f32(<8 x float>* %a) #0 {
; CHECK-LABEL: load_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    ret
  %load = load <8 x float>, <8 x float>* %a
  ret <8 x float> %load
}

define <1 x i64> @load_v1i64(<1 x i64>* %a) #0 {
; CHECK-LABEL: load_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ret
  %load = load <1 x i64>, <1 x i64>* %a
  ret <1 x i64> %load
}

define <1 x double> @load_v1f64(<1 x double>* %a) #0 {
; CHECK-LABEL: load_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ret
  %load = load <1 x double>, <1 x double>* %a
  ret <1 x double> %load
}

define <2 x i64> @load_v2i64(<2 x i64>* %a) #0 {
; CHECK-LABEL: load_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ret
  %load = load <2 x i64>, <2 x i64>* %a
  ret <2 x i64> %load
}

define <2 x double> @load_v2f64(<2 x double>* %a) #0 {
; CHECK-LABEL: load_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ret
  %load = load <2 x double>, <2 x double>* %a
  ret <2 x double> %load
}

define <4 x i64> @load_v4i64(<4 x i64>* %a) #0 {
; CHECK-LABEL: load_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    ret
  %load = load <4 x i64>, <4 x i64>* %a
  ret <4 x i64> %load
}

define <4 x double> @load_v4f64(<4 x double>* %a) #0 {
; CHECK-LABEL: load_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldp q0, q1, [x0]
; CHECK-NEXT:    ret
  %load = load <4 x double>, <4 x double>* %a
  ret <4 x double> %load
}

attributes #0 = { "target-features"="+sve" }
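The commit message covers stores as well; the sixth changed file (not shown in this excerpt) presumably holds the matching store tests. A hedged sketch of what such a case could look like; the st1b lowering in the comment is an assumption by analogy with load_v4i8 above, not autogenerated FileCheck output:

; Hypothetical store counterpart to load_v4i8. Expected lowering under
; -force-streaming-compatible-sve is assumed, not autogenerated:
;   ptrue p0.h, vl4
;   st1b { z0.h }, p0, [x0]
target triple = "aarch64-unknown-linux-gnu"

define void @store_v4i8(<4 x i8> %v, <4 x i8>* %a) #0 {
  store <4 x i8> %v, <4 x i8>* %a
  ret void
}

attributes #0 = { "target-features"="+sve" }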
