Skip to content

Commit 3a4691f

Browse files
committed
[AArch64] Add custom store lowering for 256 bit non-temporal stores.
Currently we fail to lower non-temporal stores for 256+ bit vectors to STNPQ, because type legalization will split them up into 128 bit stores. Because there are no single non-temporal stores, creating STNPQ in the Load/Store optimizer would be quite tricky. This patch adds custom lowering for 256 bit non-temporal vector stores to improve the generated code. Reviewers: dmgreen, samparker, t.p.northover, ab Reviewed By: dmgreen Differential Revision: https://reviews.llvm.org/D72919 (Cherry-picked from 535ed62)
1 parent cd79ca8 commit 3a4691f

File tree

4 files changed

+188
-5
lines changed

4 files changed

+188
-5
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -527,6 +527,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
527527
setOperationAction(ISD::LOAD, MVT::i128, Custom);
528528
setOperationAction(ISD::STORE, MVT::i128, Custom);
529529

530+
// 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
531+
// custom lowering, as there are no un-paired non-temporal stores and
532+
// legalization will break up 256 bit inputs.
533+
setOperationAction(ISD::STORE, MVT::v32i8, Custom);
534+
setOperationAction(ISD::STORE, MVT::v16i16, Custom);
535+
setOperationAction(ISD::STORE, MVT::v16f16, Custom);
536+
setOperationAction(ISD::STORE, MVT::v8i32, Custom);
537+
setOperationAction(ISD::STORE, MVT::v8f32, Custom);
538+
setOperationAction(ISD::STORE, MVT::v4f64, Custom);
539+
setOperationAction(ISD::STORE, MVT::v4i64, Custom);
540+
530541
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
531542
// This requires the Performance Monitors extension.
532543
if (Subtarget->hasPerfMon())
@@ -1385,6 +1396,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
13851396
case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
13861397
case AArch64ISD::LDP: return "AArch64ISD::LDP";
13871398
case AArch64ISD::STP: return "AArch64ISD::STP";
1399+
case AArch64ISD::STNP: return "AArch64ISD::STNP";
13881400
}
13891401
return nullptr;
13901402
}
@@ -3073,6 +3085,30 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
30733085
if (StoreNode->isTruncatingStore()) {
30743086
return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
30753087
}
3088+
// 256 bit non-temporal stores can be lowered to STNP. Do this as part of
3089+
// the custom lowering, as there are no un-paired non-temporal stores and
3090+
// legalization will break up 256 bit inputs.
3091+
if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
3092+
MemVT.getVectorElementCount().Min % 2u == 0 &&
3093+
((MemVT.getScalarSizeInBits() == 8u ||
3094+
MemVT.getScalarSizeInBits() == 16u ||
3095+
MemVT.getScalarSizeInBits() == 32u ||
3096+
MemVT.getScalarSizeInBits() == 64u))) {
3097+
SDValue Lo =
3098+
DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
3099+
MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
3100+
StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
3101+
SDValue Hi = DAG.getNode(
3102+
ISD::EXTRACT_SUBVECTOR, Dl,
3103+
MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
3104+
StoreNode->getValue(),
3105+
DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
3106+
SDValue Result = DAG.getMemIntrinsicNode(
3107+
AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
3108+
{StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
3109+
StoreNode->getMemoryVT(), StoreNode->getMemOperand());
3110+
return Result;
3111+
}
30763112
} else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
30773113
assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
30783114
SDValue Lo =

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,8 @@ enum NodeType : unsigned {
277277
STZ2G,
278278

279279
LDP,
280-
STP
280+
STP,
281+
STNP
281282
};
282283

283284
} // end namespace AArch64ISD

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,7 @@ def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
245245

246246
def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
247247
def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
248+
def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
248249

249250
// Generates the general dynamic sequences, i.e.
250251
// adrp x0, :tlsdesc:var
@@ -544,6 +545,7 @@ def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
544545

545546
def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
546547
def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
548+
def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
547549

548550
def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
549551

@@ -2809,6 +2811,10 @@ defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;
28092811
def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
28102812
(STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;
28112813

2814+
def : Pat<(AArch64stnp FPR128:$Rt, FPR128:$Rt2, (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)),
2815+
(STNPQi FPR128:$Rt, FPR128:$Rt2, GPR64sp:$Rn, simm7s16:$offset)>;
2816+
2817+
28122818
//---
28132819
// (Register offset)
28142820

llvm/test/CodeGen/AArch64/nontemporal.ll

Lines changed: 144 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,7 @@
22

33
define void @test_stnp_v4i64(<4 x i64>* %p, <4 x i64> %v) #0 {
44
; CHECK-LABEL: test_stnp_v4i64:
5-
; CHECK-NEXT: mov d[[HI1:[0-9]+]], v1[1]
6-
; CHECK-NEXT: mov d[[HI0:[0-9]+]], v0[1]
7-
; CHECK-NEXT: stnp d1, d[[HI1]], [x0, #16]
8-
; CHECK-NEXT: stnp d0, d[[HI0]], [x0]
5+
; CHECK-NEXT: stnp q0, q1, [x0]
96
; CHECK-NEXT: ret
107
store <4 x i64> %v, <4 x i64>* %p, align 1, !nontemporal !0
118
ret void
@@ -334,6 +331,149 @@ define void @test_stnp_v4f32_offset_alloca_2(<4 x float> %v) #0 {
334331
ret void
335332
}
336333

334+
define void @test_stnp_v32i8(<32 x i8> %v, <32 x i8>* %ptr) {
335+
; CHECK-LABEL: _test_stnp_v32i8:
336+
; CHECK-NEXT: .cfi_startproc
337+
; CHECK-NEXT: stnp q0, q1, [x0]
338+
; CHECK-NEXT: ret
339+
340+
entry:
341+
store <32 x i8> %v, <32 x i8>* %ptr, align 4, !nontemporal !0
342+
ret void
343+
}
344+
345+
define void @test_stnp_v32i16(<32 x i16> %v, <32 x i16>* %ptr) {
346+
; CHECK-LABEL: _test_stnp_v32i16:
347+
; CHECK-NEXT: .cfi_startproc
348+
; CHECK-NEXT: stnp q2, q3, [x0, #32]
349+
; CHECK-NEXT: stnp q0, q1, [x0]
350+
; CHECK-NEXT: ret
351+
352+
entry:
353+
store <32 x i16> %v, <32 x i16>* %ptr, align 4, !nontemporal !0
354+
ret void
355+
}
356+
357+
define void @test_stnp_v32f16(<32 x half> %v, <32 x half>* %ptr) {
358+
; CHECK-LABEL: _test_stnp_v32f16:
359+
; CHECK-NEXT: .cfi_startproc
360+
; CHECK-NEXT: stnp q2, q3, [x0, #32]
361+
; CHECK-NEXT: stnp q0, q1, [x0]
362+
; CHECK-NEXT: ret
363+
364+
entry:
365+
store <32 x half> %v, <32 x half>* %ptr, align 4, !nontemporal !0
366+
ret void
367+
}
368+
369+
define void @test_stnp_v16i32(<16 x i32> %v, <16 x i32>* %ptr) {
370+
; CHECK-LABEL: _test_stnp_v16i32:
371+
; CHECK-NEXT: .cfi_startproc
372+
; CHECK-NEXT: stnp q2, q3, [x0, #32]
373+
; CHECK-NEXT: stnp q0, q1, [x0]
374+
; CHECK-NEXT: ret
375+
376+
entry:
377+
store <16 x i32> %v, <16 x i32>* %ptr, align 4, !nontemporal !0
378+
ret void
379+
}
380+
381+
define void @test_stnp_v16f32(<16 x float> %v, <16 x float>* %ptr) {
382+
; CHECK-LABEL: _test_stnp_v16f32:
383+
; CHECK-NEXT: .cfi_startproc
384+
; CHECK-NEXT: stnp q2, q3, [x0, #32]
385+
; CHECK-NEXT: stnp q0, q1, [x0]
386+
; CHECK-NEXT: ret
387+
388+
entry:
389+
store <16 x float> %v, <16 x float>* %ptr, align 4, !nontemporal !0
390+
ret void
391+
}
392+
393+
define void @test_stnp_v17f32(<17 x float> %v, <17 x float>* %ptr) {
394+
; CHECK-LABEL: _test_stnp_v17f32:
395+
; CHECK-NEXT: .cfi_startproc
396+
; CHECK-NEXT: ldr s16, [sp, #16]
397+
; CHECK-NEXT: mov.s v0[1], v1[0]
398+
; CHECK-NEXT: mov.s v4[1], v5[0]
399+
; CHECK-NEXT: ldr s1, [sp]
400+
; CHECK-NEXT: add x8, sp, #20
401+
; CHECK-NEXT: ld1.s { v16 }[1], [x8]
402+
; CHECK-NEXT: add x8, sp, #4
403+
; CHECK-NEXT: ld1.s { v1 }[1], [x8]
404+
; CHECK-NEXT: add x8, sp, #24
405+
; CHECK-NEXT: ld1.s { v16 }[2], [x8]
406+
; CHECK-NEXT: add x8, sp, #8
407+
; CHECK-NEXT: ld1.s { v1 }[2], [x8]
408+
; CHECK-NEXT: add x8, sp, #28
409+
; CHECK-NEXT: ld1.s { v16 }[3], [x8]
410+
; CHECK-NEXT: add x8, sp, #12
411+
; CHECK-NEXT: mov.s v0[2], v2[0]
412+
; CHECK-NEXT: ldr s2, [sp, #32]
413+
; CHECK-NEXT: mov.s v4[2], v6[0]
414+
; CHECK-NEXT: mov.s v0[3], v3[0]
415+
; CHECK-NEXT: mov.s v4[3], v7[0]
416+
; CHECK-NEXT: mov d3, v4[1]
417+
; CHECK-NEXT: mov d5, v0[1]
418+
; CHECK-NEXT: ld1.s { v1 }[3], [x8]
419+
; CHECK-NEXT: stnp d4, d3, [x0, #16]
420+
; CHECK-NEXT: stnp d0, d5, [x0]
421+
; CHECK-NEXT: mov d0, v16[1]
422+
; CHECK-NEXT: mov d3, v1[1]
423+
; CHECK-NEXT: stnp d16, d0, [x0, #48]
424+
; CHECK-NEXT: stnp d1, d3, [x0, #32]
425+
; CHECK-NEXT: str s2, [x0, #64]
426+
; CHECK-NEXT: ret
427+
428+
entry:
429+
store <17 x float> %v, <17 x float>* %ptr, align 4, !nontemporal !0
430+
ret void
431+
}
432+
define void @test_stnp_v16i32_invalid_offset(<16 x i32> %v, <16 x i32>* %ptr) {
433+
; CHECK-LABEL: _test_stnp_v16i32_invalid_offset:
434+
; CHECK-NEXT: .cfi_startproc
435+
; CHECK-NEXT: mov w8, #32000
436+
; CHECK-NEXT: mov w9, #32032
437+
; CHECK-NEXT: add x8, x0, x8
438+
; CHECK-NEXT: add x9, x0, x9
439+
; CHECK-NEXT: stnp q2, q3, [x9]
440+
; CHECK-NEXT: stnp q0, q1, [x8]
441+
; CHECK-NEXT: ret
442+
443+
entry:
444+
%gep = getelementptr <16 x i32>, <16 x i32>* %ptr, i32 500
445+
store <16 x i32> %v, <16 x i32>* %gep, align 4, !nontemporal !0
446+
ret void
447+
}
448+
449+
define void @test_stnp_v16f64(<16 x double> %v, <16 x double>* %ptr) {
450+
; CHECK-LABEL: _test_stnp_v16f64:
451+
; CHECK-NEXT: .cfi_startproc
452+
; CHECK-NEXT: stnp q6, q7, [x0, #96]
453+
; CHECK-NEXT: stnp q4, q5, [x0, #64]
454+
; CHECK-NEXT: stnp q2, q3, [x0, #32]
455+
; CHECK-NEXT: stnp q0, q1, [x0]
456+
; CHECK-NEXT: ret
457+
458+
entry:
459+
store <16 x double> %v, <16 x double>* %ptr, align 4, !nontemporal !0
460+
ret void
461+
}
462+
463+
define void @test_stnp_v16i64(<16 x i64> %v, <16 x i64>* %ptr) {
464+
; CHECK-LABEL: _test_stnp_v16i64:
465+
; CHECK-NEXT: .cfi_startproc
466+
; CHECK-NEXT: stnp q6, q7, [x0, #96]
467+
; CHECK-NEXT: stnp q4, q5, [x0, #64]
468+
; CHECK-NEXT: stnp q2, q3, [x0, #32]
469+
; CHECK-NEXT: stnp q0, q1, [x0]
470+
; CHECK-NEXT: ret
471+
472+
entry:
473+
store <16 x i64> %v, <16 x i64>* %ptr, align 4, !nontemporal !0
474+
ret void
475+
}
476+
337477
!0 = !{ i32 1 }
338478

339479
attributes #0 = { nounwind }

0 commit comments

Comments
 (0)