
Commit 747bb85

[AArch64][SVE] Lower unpredicated loads/stores as LDR/STR with sve-vector-bits=128.
Given the code below:

```cpp
svuint8_t foo(uint8_t *x) {
  return svld1(svptrue_b8(), x);
}
```

When compiled with -msve-vector-bits=128 (or vscale_range(1, 1)), we currently generate:

```gas
foo:
        ptrue   p0.b
        ld1b    { z0.b }, p0/z, [x0]
        ret
```

Whereas (on little-endian) we could instead use LDR:

```gas
foo:
        ldr     q0, [x0]
        ret
```

Besides avoiding the predicate dependency, the above form enables further optimisations such as LDP folds. Likewise for stores.
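To illustrate the LDP/STP point, here is a minimal sketch; the function name, source, and assembly below are illustrative assumptions, not part of this commit or its tests. Once both accesses are emitted as unpredicated LDR/STR Q, the AArch64 load/store-pair optimiser is free to merge the adjacent pairs:

```cpp
#include <arm_sve.h>

// Illustrative only (not from this commit's tests): two adjacent 128-bit
// accesses that become LDP/STP candidates once they are lowered to plain
// LDR/STR Q instead of predicated LD1B/ST1B.
void copy32(uint8_t *dst, uint8_t *src) {
  svbool_t pg = svptrue_b8();
  svst1(pg, dst, svld1(pg, src));            // first 16 bytes
  svst1(pg, dst + 16, svld1(pg, src + 16));  // next 16 bytes
}
```

Expected shape with -msve-vector-bits=128 (assumed, not verified here):

```gas
copy32:
        ldp     q0, q1, [x1]
        stp     q0, q1, [x0]
        ret
```

The pairing relies on the two accesses being adjacent at the known 128-bit vector size; the predicated LD1B/ST1B form offers no such fold.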
1 parent f0129f0 commit 747bb85

2 files changed: +97, -28 lines
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 55 additions & 0 deletions
```diff
@@ -23589,6 +23589,31 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
   return DAG.getMergeValues({Extract, TokenFactor}, DL);
 }
 
+// Replace scalable loads with fixed loads when vscale_range(1, 1).
+// This enables further optimisations such as LDP folds.
+static SDValue combineVScale1Load(LoadSDNode *LD, SelectionDAG &DAG,
+                                  const AArch64Subtarget *Subtarget) {
+  EVT MemVT = LD->getMemoryVT();
+  if (!MemVT.isScalableVector() ||
+      Subtarget->getMaxSVEVectorSizeInBits() != AArch64::SVEBitsPerBlock)
+    return SDValue();
+
+  // Skip unpacked types given their different layouts between Neon and SVE.
+  if (MemVT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
+    return SDValue();
+
+  SDLoc DL(LD);
+  MVT NewVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
+                               MemVT.getVectorMinNumElements());
+  SDValue NewLoad = DAG.getLoad(
+      NewVT, DL, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
+      LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo());
+  SDValue Insert = convertToScalableVector(DAG, MemVT, NewLoad);
+  SDValue TokenFactor = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                    {SDValue(cast<SDNode>(NewLoad), 1)});
+  return DAG.getMergeValues({Insert, TokenFactor}, DL);
+}
+
 // Perform TBI simplification if supported by the target and try to break up
 // nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
 // load instructions can be selected.
@@ -23626,6 +23651,9 @@ static SDValue performLOADCombine(SDNode *N,
   if (SDValue Res = combineV3I8LoadExt(LD, DAG))
     return Res;
 
+  if (SDValue Res = combineVScale1Load(LD, DAG, Subtarget))
+    return Res;
+
   if (!LD->isNonTemporal())
     return SDValue(N, 0);
 
@@ -23884,6 +23912,30 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
   return Chain;
 }
 
+// Replace scalable stores with fixed stores when vscale_range(1, 1).
+static SDValue combineVScale1Store(StoreSDNode *ST, SelectionDAG &DAG,
+                                   const AArch64Subtarget *Subtarget) {
+  SDValue Value = ST->getValue();
+  EVT ValueVT = Value.getValueType();
+  if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
+      !ValueVT.isScalableVector() ||
+      Subtarget->getMaxSVEVectorSizeInBits() != AArch64::SVEBitsPerBlock)
+    return SDValue();
+
+  // Skip unpacked types given their different layouts between Neon and SVE.
+  if (ValueVT.getSizeInBits().getKnownMinValue() != AArch64::SVEBitsPerBlock)
+    return SDValue();
+
+  SDLoc DL(ST);
+  MVT NewVT = MVT::getVectorVT(ValueVT.getVectorElementType().getSimpleVT(),
+                               ValueVT.getVectorMinNumElements());
+  SDValue NewValue = convertFromScalableVector(DAG, NewVT, Value);
+  SDValue NewStore = DAG.getStore(
+      ST->getChain(), DL, NewValue, ST->getBasePtr(), ST->getPointerInfo(),
+      ST->getOriginalAlign(), ST->getMemOperand()->getFlags(), ST->getAAInfo());
+  return NewStore;
+}
+
 static SDValue performSTORECombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI,
                                    SelectionDAG &DAG,
@@ -23918,6 +23970,9 @@ static SDValue performSTORECombine(SDNode *N,
   if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
     return Res;
 
+  if (SDValue Res = combineVScale1Store(ST, DAG, Subtarget))
+    return Res;
+
   // If this is an FP_ROUND followed by a store, fold this into a truncating
   // store. We can do this even if this is already a truncstore.
   // We purposefully don't care about legality of the nodes here as we know
```
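The combine fires on plain unpredicated scalable loads and stores of fully packed 128-bit types, as exercised by the tests below. A minimal IR reproducer looks roughly like the following; the attribute group and its spelling are assumptions inferred from the commit message, and only the function body mirrors the updated test file:

```llvm
; Assumed reproducer: vscale_range(1,1) is what pins the SVE vector length
; to 128 bits and lets combineVScale1Load/Store rewrite the access.
define <vscale x 16 x i8> @ld_nxv16i8(ptr %p) #0 {
  %v = load <vscale x 16 x i8>, ptr %p, align 16
  ret <vscale x 16 x i8> %v
}

attributes #0 = { "target-features"="+sve" vscale_range(1,1) }
```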

llvm/test/CodeGen/AArch64/sve-unpred-loads-stores.ll

Lines changed: 42 additions & 28 deletions
```diff
@@ -12,7 +12,7 @@ define <vscale x 16 x i8> @ld_nxv16i8(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv16i8:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
 ; CHECK-128-NEXT: ret
   %2 = load <vscale x 16 x i8>, ptr %0, align 16
   ret <vscale x 16 x i8> %2
@@ -26,7 +26,7 @@ define void @st_nxv16i8(ptr %0, <vscale x 16 x i8> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv16i8:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
 ; CHECK-128-NEXT: ret
   store <vscale x 16 x i8> %1, ptr %0, align 16
   ret void
@@ -40,7 +40,7 @@ define <vscale x 8 x i16> @ld_nxv8i16(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv8i16:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
 ; CHECK-128-NEXT: ret
   %2 = load <vscale x 8 x i16>, ptr %0, align 16
   ret <vscale x 8 x i16> %2
@@ -54,7 +54,7 @@ define void @st_nxv8i16(ptr %0, <vscale x 8 x i16> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv8i16:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
 ; CHECK-128-NEXT: ret
   store <vscale x 8 x i16> %1, ptr %0, align 16
   ret void
@@ -68,7 +68,7 @@ define <vscale x 4 x i32> @ld_nxv4i32(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv4i32:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
 ; CHECK-128-NEXT: ret
   %2 = load <vscale x 4 x i32>, ptr %0, align 16
   ret <vscale x 4 x i32> %2
@@ -82,7 +82,7 @@ define void @st_nxv4i32(ptr %0, <vscale x 4 x i32> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv4i32:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
 ; CHECK-128-NEXT: ret
   store <vscale x 4 x i32> %1, ptr %0, align 16
   ret void
@@ -96,7 +96,7 @@ define <vscale x 2 x i64> @ld_nxv2i64(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv2i64:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
 ; CHECK-128-NEXT: ret
   %2 = load <vscale x 2 x i64>, ptr %0, align 16
   ret <vscale x 2 x i64> %2
@@ -110,7 +110,7 @@ define void @st_nxv2i64(ptr %0, <vscale x 2 x i64> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv2i64:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
 ; CHECK-128-NEXT: ret
   store <vscale x 2 x i64> %1, ptr %0, align 16
   ret void
@@ -124,7 +124,7 @@ define <vscale x 8 x half> @ld_nxv8f16(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv8f16:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
 ; CHECK-128-NEXT: ret
   %2 = load <vscale x 8 x half>, ptr %0, align 16
   ret <vscale x 8 x half> %2
@@ -138,7 +138,7 @@ define void @st_nxv8f16(ptr %0, <vscale x 8 x half> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv8f16:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
 ; CHECK-128-NEXT: ret
   store <vscale x 8 x half> %1, ptr %0, align 16
   ret void
@@ -152,7 +152,7 @@ define <vscale x 4 x float> @ld_nxv4f32(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv4f32:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
 ; CHECK-128-NEXT: ret
   %2 = load <vscale x 4 x float>, ptr %0, align 16
   ret <vscale x 4 x float> %2
@@ -166,7 +166,7 @@ define void @st_nxv4f32(ptr %0, <vscale x 4 x float> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv4f32:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
 ; CHECK-128-NEXT: ret
   store <vscale x 4 x float> %1, ptr %0, align 16
   ret void
@@ -180,7 +180,7 @@ define <vscale x 2 x double> @ld_nxv2f64(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv2f64:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0]
+; CHECK-128-NEXT: ldr q0, [x0]
 ; CHECK-128-NEXT: ret
   %2 = load <vscale x 2 x double>, ptr %0, align 16
   ret <vscale x 2 x double> %2
@@ -194,7 +194,7 @@ define void @st_nxv2f64(ptr %0, <vscale x 2 x double> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv2f64:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0]
+; CHECK-128-NEXT: str q0, [x0]
 ; CHECK-128-NEXT: ret
   store <vscale x 2 x double> %1, ptr %0, align 16
   ret void
@@ -208,7 +208,8 @@ define <vscale x 16 x i8> @ld_nxv16i8_offset(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv16i8_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %2 = tail call i64 @llvm.vscale.i64()
   %3 = shl nuw nsw i64 %2, 4
@@ -225,7 +226,8 @@ define void @st_nxv16i8_offset(ptr %0, <vscale x 16 x i8> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv16i8_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %3 = tail call i64 @llvm.vscale.i64()
   %4 = shl nuw nsw i64 %3, 4
@@ -242,7 +244,8 @@ define <vscale x 8 x i16> @ld_nxv8i16_offset(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv8i16_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %2 = tail call i64 @llvm.vscale.i64()
   %3 = shl nuw nsw i64 %2, 4
@@ -259,7 +262,8 @@ define void @st_nxv8i16_offset(ptr %0, <vscale x 8 x i16> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv8i16_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %3 = tail call i64 @llvm.vscale.i64()
   %4 = shl nuw nsw i64 %3, 4
@@ -276,7 +280,8 @@ define <vscale x 4 x i32> @ld_nxv4i32_offset(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv4i32_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %2 = tail call i64 @llvm.vscale.i64()
   %3 = shl nuw nsw i64 %2, 4
@@ -293,7 +298,8 @@ define void @st_nxv4i32_offset(ptr %0, <vscale x 4 x i32> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv4i32_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %3 = tail call i64 @llvm.vscale.i64()
   %4 = shl nuw nsw i64 %3, 4
@@ -310,7 +316,8 @@ define <vscale x 2 x i64> @ld_nxv2i64_offset(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv2i64_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %2 = tail call i64 @llvm.vscale.i64()
   %3 = shl nuw nsw i64 %2, 4
@@ -327,7 +334,8 @@ define void @st_nxv2i64_offset(ptr %0, <vscale x 2 x i64> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv2i64_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %3 = tail call i64 @llvm.vscale.i64()
   %4 = shl nuw nsw i64 %3, 4
@@ -344,7 +352,8 @@ define <vscale x 8 x half> @ld_nxv8f16_offset(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv8f16_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %2 = tail call i64 @llvm.vscale.i64()
   %3 = shl nuw nsw i64 %2, 4
@@ -361,7 +370,8 @@ define void @st_nxv8f16_offset(ptr %0, <vscale x 8 x half> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv8f16_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %3 = tail call i64 @llvm.vscale.i64()
   %4 = shl nuw nsw i64 %3, 4
@@ -378,7 +388,8 @@ define <vscale x 4 x float> @ld_nxv4f32_offset(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv4f32_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %2 = tail call i64 @llvm.vscale.i64()
   %3 = shl nuw nsw i64 %2, 4
@@ -395,7 +406,8 @@ define void @st_nxv4f32_offset(ptr %0, <vscale x 4 x float> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv4f32_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %3 = tail call i64 @llvm.vscale.i64()
   %4 = shl nuw nsw i64 %3, 4
@@ -412,7 +424,8 @@ define <vscale x 2 x double> @ld_nxv2f64_offset(ptr %0) #0 {
 ;
 ; CHECK-128-LABEL: ld_nxv2f64_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: ldr q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %2 = tail call i64 @llvm.vscale.i64()
   %3 = shl nuw nsw i64 %2, 4
@@ -429,7 +442,8 @@ define void @st_nxv2f64_offset(ptr %0, <vscale x 2 x double> %1) #0 {
 ;
 ; CHECK-128-LABEL: st_nxv2f64_offset:
 ; CHECK-128: // %bb.0:
-; CHECK-128-NEXT: str z0, [x0, #1, mul vl]
+; CHECK-128-NEXT: rdvl x8, #1
+; CHECK-128-NEXT: str q0, [x0, x8]
 ; CHECK-128-NEXT: ret
   %3 = tail call i64 @llvm.vscale.i64()
   %4 = shl nuw nsw i64 %3, 4
```
