Skip to content

Commit aa0f37e

Browse files
[AArch64][SVE] Add first-faulting load intrinsic
Summary: Implements the llvm.aarch64.sve.ldff1 intrinsic and DAG combine rules for
first-faulting loads with sign & zero extends

Reviewers: sdesmalen, efriedma, andwar, dancgr, rengolin

Reviewed By: sdesmalen

Subscribers: tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, cameron.mcinally, cfe-commits, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D73025
1 parent 4321c6a commit aa0f37e

File tree

7 files changed

+284
-10
lines changed

7 files changed

+284
-10
lines changed

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1176,6 +1176,7 @@ class AdvSIMD_ScatterStore_VectorBase_Intrinsic
11761176
def int_aarch64_sve_ldnt1 : AdvSIMD_1Vec_PredLoad_Intrinsic;
11771177

11781178
def int_aarch64_sve_ldnf1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;
1179+
def int_aarch64_sve_ldff1 : AdvSIMD_1Vec_PredFaultingLoad_Intrinsic;
11791180

11801181
//
11811182
// Stores

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1375,6 +1375,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
13751375
case AArch64ISD::PTRUE: return "AArch64ISD::PTRUE";
13761376
case AArch64ISD::LDNF1: return "AArch64ISD::LDNF1";
13771377
case AArch64ISD::LDNF1S: return "AArch64ISD::LDNF1S";
1378+
case AArch64ISD::LDFF1: return "AArch64ISD::LDFF1";
1379+
case AArch64ISD::LDFF1S: return "AArch64ISD::LDFF1S";
13781380
case AArch64ISD::GLD1: return "AArch64ISD::GLD1";
13791381
case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED";
13801382
case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW";
@@ -10237,6 +10239,7 @@ static SDValue performSVEAndCombine(SDNode *N,
1023710239
// perfect candidates for combining.
1023810240
switch (Src->getOpcode()) {
1023910241
case AArch64ISD::LDNF1:
10242+
case AArch64ISD::LDFF1:
1024010243
MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
1024110244
break;
1024210245
case AArch64ISD::GLD1:
@@ -11298,7 +11301,7 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
1129811301
ISD::UNINDEXED, false, false);
1129911302
}
1130011303

11301-
static SDValue performLDNF1Combine(SDNode *N, SelectionDAG &DAG) {
11304+
static SDValue performLDNF1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
1130211305
SDLoc DL(N);
1130311306
EVT VT = N->getValueType(0);
1130411307

@@ -11315,7 +11318,7 @@ static SDValue performLDNF1Combine(SDNode *N, SelectionDAG &DAG) {
1131511318
N->getOperand(3), // Base
1131611319
DAG.getValueType(VT) };
1131711320

11318-
SDValue Load = DAG.getNode(AArch64ISD::LDNF1, DL, VTs, Ops);
11321+
SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
1131911322
SDValue LoadChain = SDValue(Load.getNode(), 1);
1132011323

1132111324
if (ContainerVT.isInteger() && (VT != ContainerVT))
@@ -12571,6 +12574,10 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
1257112574
NewOpc = AArch64ISD::LDNF1S;
1257212575
MemVTOpNum = 3;
1257312576
break;
12577+
case AArch64ISD::LDFF1:
12578+
NewOpc = AArch64ISD::LDFF1S;
12579+
MemVTOpNum = 3;
12580+
break;
1257412581
case AArch64ISD::GLD1:
1257512582
NewOpc = AArch64ISD::GLD1S;
1257612583
break;
@@ -12706,7 +12713,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
1270612713
case Intrinsic::aarch64_sve_ldnt1:
1270712714
return performLDNT1Combine(N, DAG);
1270812715
case Intrinsic::aarch64_sve_ldnf1:
12709-
return performLDNF1Combine(N, DAG);
12716+
return performLDNF1Combine(N, DAG, AArch64ISD::LDNF1);
12717+
case Intrinsic::aarch64_sve_ldff1:
12718+
return performLDNF1Combine(N, DAG, AArch64ISD::LDFF1);
1271012719
case Intrinsic::aarch64_sve_stnt1:
1271112720
return performSTNT1Combine(N, DAG);
1271212721
case Intrinsic::aarch64_sve_ld1_gather:

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,8 @@ enum NodeType : unsigned {
217217

218218
LDNF1,
219219
LDNF1S,
220+
LDFF1,
221+
LDFF1S,
220222

221223
// Unsigned gather loads.
222224
GLD1,

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -549,13 +549,6 @@ def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDN
549549

550550
def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
551551

552-
def SDT_AArch64_LDNF1 : SDTypeProfile<1, 3, [
553-
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
554-
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
555-
]>;
556-
557-
def AArch64ldnf1 : SDNode<"AArch64ISD::LDNF1", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
558-
559552
//===----------------------------------------------------------------------===//
560553

561554
//===----------------------------------------------------------------------===//

llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,11 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13+
def SDT_AArch64_LDNF1 : SDTypeProfile<1, 3, [
14+
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
15+
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
16+
]>;
17+
1318
def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [
1419
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
1520
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
@@ -38,6 +43,8 @@ def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED",
3843
def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
3944
def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SST1_IMM, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
4045

46+
def AArch64ldnf1 : SDNode<"AArch64ISD::LDNF1", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
47+
def AArch64ldff1 : SDNode<"AArch64ISD::LDFF1", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
4148
def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
4249
def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
4350
def AArch64ld1_gather_uxtw : SDNode<"AArch64ISD::GLD1_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
@@ -58,6 +65,7 @@ def sve_cntw_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -4>">;
5865
def sve_cntd_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -2>">;
5966

6067
def AArch64ldnf1s : SDNode<"AArch64ISD::LDNF1S", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
68+
def AArch64ldff1s : SDNode<"AArch64ISD::LDFF1S", SDT_AArch64_LDNF1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
6169
def AArch64ld1s_gather : SDNode<"AArch64ISD::GLD1S", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
6270
def AArch64ld1s_gather_scaled : SDNode<"AArch64ISD::GLD1S_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
6371
def AArch64ld1s_gather_uxtw : SDNode<"AArch64ISD::GLD1S_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
@@ -1340,6 +1348,40 @@ let Predicates = [HasSVE] in {
13401348
// 16-element contiguous non-faulting loads
13411349
defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1, nxv16i1, nxv16i8>;
13421350

1351+
multiclass ldff1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> {
1352+
// Add more complex addressing modes here as required.
1353+
// Base
1354+
def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
1355+
(I PPR:$gp, GPR64sp:$base, XZR)>;
1356+
}
1357+
1358+
// 2-element contiguous first faulting loads
1359+
defm : ldff1<LDFF1B_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i8>;
1360+
defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i8>;
1361+
defm : ldff1<LDFF1H_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i16>;
1362+
defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i16>;
1363+
defm : ldff1<LDFF1W_D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i32>;
1364+
defm : ldff1<LDFF1SW_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i32>;
1365+
defm : ldff1<LDFF1D, nxv2i64, AArch64ldff1, nxv2i1, nxv2i64>;
1366+
defm : ldff1<LDFF1W_D, nxv2f32, AArch64ldff1, nxv2i1, nxv2f32>;
1367+
defm : ldff1<LDFF1D, nxv2f64, AArch64ldff1, nxv2i1, nxv2f64>;
1368+
1369+
// 4-element contiguous first faulting loads
1370+
defm : ldff1<LDFF1B_S, nxv4i32, AArch64ldff1, nxv4i1, nxv4i8>;
1371+
defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i8>;
1372+
defm : ldff1<LDFF1H_S, nxv4i32, AArch64ldff1, nxv4i1, nxv4i16>;
1373+
defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i16>;
1374+
defm : ldff1<LDFF1W, nxv4i32, AArch64ldff1, nxv4i1, nxv4i32>;
1375+
defm : ldff1<LDFF1W, nxv4f32, AArch64ldff1, nxv4i1, nxv4f32>;
1376+
1377+
// 8-element contiguous first faulting loads
1378+
defm : ldff1<LDFF1B_H, nxv8i16, AArch64ldff1, nxv8i1, nxv8i8>;
1379+
defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s, nxv8i1, nxv8i8>;
1380+
defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1, nxv8i1, nxv8i16>;
1381+
defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1, nxv8i1, nxv8f16>;
1382+
1383+
// 16-element contiguous first faulting loads
1384+
defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1, nxv16i1, nxv16i8>;
13431385
}
13441386

13451387
let Predicates = [HasSVE2] in {

llvm/lib/Target/AArch64/SVEInstrFormats.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5780,6 +5780,13 @@ multiclass sve_mem_cldff_ss<bits<4> dtype, string asm, RegisterOperand listty,
57805780

57815781
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
57825782
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
5783+
5784+
// We need a layer of indirection because early machine code passes balk at
5785+
// physical register (i.e. FFR) uses that have no previous definition.
5786+
let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
5787+
def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), []>,
5788+
PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm)>;
5789+
}
57835790
}
57845791

57855792
multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty,
Lines changed: 220 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,220 @@
1+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck %s
2+
3+
;
4+
; LDFF1B
5+
;
6+
7+
define <vscale x 16 x i8> @ldff1b(<vscale x 16 x i1> %pg, i8* %a) {
8+
; CHECK-LABEL: ldff1b:
9+
; CHECK: ldff1b { z0.b }, p0/z, [x0]
10+
; CHECK-NEXT: ret
11+
%load = call <vscale x 16 x i8> @llvm.aarch64.sve.ldff1.nxv16i8(<vscale x 16 x i1> %pg, i8* %a)
12+
ret <vscale x 16 x i8> %load
13+
}
14+
15+
define <vscale x 8 x i16> @ldff1b_h(<vscale x 8 x i1> %pg, i8* %a) {
16+
; CHECK-LABEL: ldff1b_h:
17+
; CHECK: ldff1b { z0.h }, p0/z, [x0]
18+
; CHECK-NEXT: ret
19+
%load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldff1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
20+
%res = zext <vscale x 8 x i8> %load to <vscale x 8 x i16>
21+
ret <vscale x 8 x i16> %res
22+
}
23+
24+
define <vscale x 4 x i32> @ldff1b_s(<vscale x 4 x i1> %pg, i8* %a) {
25+
; CHECK-LABEL: ldff1b_s:
26+
; CHECK: ldff1b { z0.s }, p0/z, [x0]
27+
; CHECK-NEXT: ret
28+
%load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a)
29+
%res = zext <vscale x 4 x i8> %load to <vscale x 4 x i32>
30+
ret <vscale x 4 x i32> %res
31+
}
32+
33+
define <vscale x 2 x i64> @ldff1b_d(<vscale x 2 x i1> %pg, i8* %a) {
34+
; CHECK-LABEL: ldff1b_d:
35+
; CHECK: ldff1b { z0.d }, p0/z, [x0]
36+
; CHECK-NEXT: ret
37+
%load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a)
38+
%res = zext <vscale x 2 x i8> %load to <vscale x 2 x i64>
39+
ret <vscale x 2 x i64> %res
40+
}
41+
42+
;
43+
; LDFF1SB
44+
;
45+
46+
define <vscale x 8 x i16> @ldff1sb_h(<vscale x 8 x i1> %pg, i8* %a) {
47+
; CHECK-LABEL: ldff1sb_h:
48+
; CHECK: ldff1sb { z0.h }, p0/z, [x0]
49+
; CHECK-NEXT: ret
50+
%load = call <vscale x 8 x i8> @llvm.aarch64.sve.ldff1.nxv8i8(<vscale x 8 x i1> %pg, i8* %a)
51+
%res = sext <vscale x 8 x i8> %load to <vscale x 8 x i16>
52+
ret <vscale x 8 x i16> %res
53+
}
54+
55+
define <vscale x 4 x i32> @ldff1sb_s(<vscale x 4 x i1> %pg, i8* %a) {
56+
; CHECK-LABEL: ldff1sb_s:
57+
; CHECK: ldff1sb { z0.s }, p0/z, [x0]
58+
; CHECK-NEXT: ret
59+
%load = call <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.nxv4i8(<vscale x 4 x i1> %pg, i8* %a)
60+
%res = sext <vscale x 4 x i8> %load to <vscale x 4 x i32>
61+
ret <vscale x 4 x i32> %res
62+
}
63+
64+
define <vscale x 2 x i64> @ldff1sb_d(<vscale x 2 x i1> %pg, i8* %a) {
65+
; CHECK-LABEL: ldff1sb_d:
66+
; CHECK: ldff1sb { z0.d }, p0/z, [x0]
67+
; CHECK-NEXT: ret
68+
%load = call <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.nxv2i8(<vscale x 2 x i1> %pg, i8* %a)
69+
%res = sext <vscale x 2 x i8> %load to <vscale x 2 x i64>
70+
ret <vscale x 2 x i64> %res
71+
}
72+
73+
;
74+
; LDFF1H
75+
;
76+
77+
define <vscale x 8 x i16> @ldff1h(<vscale x 8 x i1> %pg, i16* %a) {
78+
; CHECK-LABEL: ldff1h:
79+
; CHECK: ldff1h { z0.h }, p0/z, [x0]
80+
; CHECK-NEXT: ret
81+
%load = call <vscale x 8 x i16> @llvm.aarch64.sve.ldff1.nxv8i16(<vscale x 8 x i1> %pg, i16* %a)
82+
ret <vscale x 8 x i16> %load
83+
}
84+
85+
define <vscale x 4 x i32> @ldff1h_s(<vscale x 4 x i1> %pg, i16* %a) {
86+
; CHECK-LABEL: ldff1h_s:
87+
; CHECK: ldff1h { z0.s }, p0/z, [x0]
88+
; CHECK-NEXT: ret
89+
%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
90+
%res = zext <vscale x 4 x i16> %load to <vscale x 4 x i32>
91+
ret <vscale x 4 x i32> %res
92+
}
93+
94+
define <vscale x 2 x i64> @ldff1h_d(<vscale x 2 x i1> %pg, i16* %a) {
95+
; CHECK-LABEL: ldff1h_d:
96+
; CHECK: ldff1h { z0.d }, p0/z, [x0]
97+
; CHECK-NEXT: ret
98+
%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a)
99+
%res = zext <vscale x 2 x i16> %load to <vscale x 2 x i64>
100+
ret <vscale x 2 x i64> %res
101+
}
102+
103+
define <vscale x 8 x half> @ldff1h_f16(<vscale x 8 x i1> %pg, half* %a) {
104+
; CHECK-LABEL: ldff1h_f16:
105+
; CHECK: ldff1h { z0.h }, p0/z, [x0]
106+
; CHECK-NEXT: ret
107+
%load = call <vscale x 8 x half> @llvm.aarch64.sve.ldff1.nxv8f16(<vscale x 8 x i1> %pg, half* %a)
108+
ret <vscale x 8 x half> %load
109+
}
110+
111+
;
112+
; LDFF1SH
113+
;
114+
115+
define <vscale x 4 x i32> @ldff1sh_s(<vscale x 4 x i1> %pg, i16* %a) {
116+
; CHECK-LABEL: ldff1sh_s:
117+
; CHECK: ldff1sh { z0.s }, p0/z, [x0]
118+
; CHECK-NEXT: ret
119+
%load = call <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.nxv4i16(<vscale x 4 x i1> %pg, i16* %a)
120+
%res = sext <vscale x 4 x i16> %load to <vscale x 4 x i32>
121+
ret <vscale x 4 x i32> %res
122+
}
123+
124+
define <vscale x 2 x i64> @ldff1sh_d(<vscale x 2 x i1> %pg, i16* %a) {
125+
; CHECK-LABEL: ldff1sh_d:
126+
; CHECK: ldff1sh { z0.d }, p0/z, [x0]
127+
; CHECK-NEXT: ret
128+
%load = call <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.nxv2i16(<vscale x 2 x i1> %pg, i16* %a)
129+
%res = sext <vscale x 2 x i16> %load to <vscale x 2 x i64>
130+
ret <vscale x 2 x i64> %res
131+
}
132+
133+
;
134+
; LDFF1W
135+
;
136+
137+
define <vscale x 4 x i32> @ldff1w(<vscale x 4 x i1> %pg, i32* %a) {
138+
; CHECK-LABEL: ldff1w:
139+
; CHECK: ldff1w { z0.s }, p0/z, [x0]
140+
; CHECK-NEXT: ret
141+
%load = call <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.nxv4i32(<vscale x 4 x i1> %pg, i32* %a)
142+
ret <vscale x 4 x i32> %load
143+
}
144+
145+
define <vscale x 2 x i64> @ldff1w_d(<vscale x 2 x i1> %pg, i32* %a) {
146+
; CHECK-LABEL: ldff1w_d:
147+
; CHECK: ldff1w { z0.d }, p0/z, [x0]
148+
; CHECK-NEXT: ret
149+
%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a)
150+
%res = zext <vscale x 2 x i32> %load to <vscale x 2 x i64>
151+
ret <vscale x 2 x i64> %res
152+
}
153+
154+
define <vscale x 4 x float> @ldff1w_f32(<vscale x 4 x i1> %pg, float* %a) {
155+
; CHECK-LABEL: ldff1w_f32:
156+
; CHECK: ldff1w { z0.s }, p0/z, [x0]
157+
; CHECK-NEXT: ret
158+
%load = call <vscale x 4 x float> @llvm.aarch64.sve.ldff1.nxv4f32(<vscale x 4 x i1> %pg, float* %a)
159+
ret <vscale x 4 x float> %load
160+
}
161+
162+
define <vscale x 2 x float> @ldff1w_2f32(<vscale x 2 x i1> %pg, float* %a) {
163+
; CHECK-LABEL: ldff1w_2f32:
164+
; CHECK: ldff1w { z0.d }, p0/z, [x0]
165+
; CHECK-NEXT: ret
166+
%load = call <vscale x 2 x float> @llvm.aarch64.sve.ldff1.nxv2f32(<vscale x 2 x i1> %pg, float* %a)
167+
ret <vscale x 2 x float> %load
168+
}
169+
170+
;
171+
; LDFF1SW
172+
;
173+
174+
define <vscale x 2 x i64> @ldff1sw_d(<vscale x 2 x i1> %pg, i32* %a) {
175+
; CHECK-LABEL: ldff1sw_d:
176+
; CHECK: ldff1sw { z0.d }, p0/z, [x0]
177+
; CHECK-NEXT: ret
178+
%load = call <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.nxv2i32(<vscale x 2 x i1> %pg, i32* %a)
179+
%res = sext <vscale x 2 x i32> %load to <vscale x 2 x i64>
180+
ret <vscale x 2 x i64> %res
181+
}
182+
183+
;
184+
; LDFF1D
185+
;
186+
187+
define <vscale x 2 x i64> @ldff1d(<vscale x 2 x i1> %pg, i64* %a) {
188+
; CHECK-LABEL: ldff1d:
189+
; CHECK: ldff1d { z0.d }, p0/z, [x0]
190+
; CHECK-NEXT: ret
191+
%load = call <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.nxv2i64(<vscale x 2 x i1> %pg, i64* %a)
192+
ret <vscale x 2 x i64> %load
193+
}
194+
195+
196+
define <vscale x 2 x double> @ldff1d_f64(<vscale x 2 x i1> %pg, double* %a) {
197+
; CHECK-LABEL: ldff1d_f64:
198+
; CHECK: ldff1d { z0.d }, p0/z, [x0]
199+
; CHECK-NEXT: ret
200+
%load = call <vscale x 2 x double> @llvm.aarch64.sve.ldff1.nxv2f64(<vscale x 2 x i1> %pg, double* %a)
201+
ret <vscale x 2 x double> %load
202+
}
203+
204+
declare <vscale x 16 x i8> @llvm.aarch64.sve.ldff1.nxv16i8(<vscale x 16 x i1>, i8*)
205+
206+
declare <vscale x 8 x i8> @llvm.aarch64.sve.ldff1.nxv8i8(<vscale x 8 x i1>, i8*)
207+
declare <vscale x 8 x i16> @llvm.aarch64.sve.ldff1.nxv8i16(<vscale x 8 x i1>, i16*)
208+
declare <vscale x 8 x half> @llvm.aarch64.sve.ldff1.nxv8f16(<vscale x 8 x i1>, half*)
209+
210+
declare <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.nxv4i8(<vscale x 4 x i1>, i8*)
211+
declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.nxv4i16(<vscale x 4 x i1>, i16*)
212+
declare <vscale x 4 x i32> @llvm.aarch64.sve.ldff1.nxv4i32(<vscale x 4 x i1>, i32*)
213+
declare <vscale x 2 x float> @llvm.aarch64.sve.ldff1.nxv2f32(<vscale x 2 x i1>, float*)
214+
declare <vscale x 4 x float> @llvm.aarch64.sve.ldff1.nxv4f32(<vscale x 4 x i1>, float*)
215+
216+
declare <vscale x 2 x i8> @llvm.aarch64.sve.ldff1.nxv2i8(<vscale x 2 x i1>, i8*)
217+
declare <vscale x 2 x i16> @llvm.aarch64.sve.ldff1.nxv2i16(<vscale x 2 x i1>, i16*)
218+
declare <vscale x 2 x i32> @llvm.aarch64.sve.ldff1.nxv2i32(<vscale x 2 x i1>, i32*)
219+
declare <vscale x 2 x i64> @llvm.aarch64.sve.ldff1.nxv2i64(<vscale x 2 x i1>, i64*)
220+
declare <vscale x 2 x double> @llvm.aarch64.sve.ldff1.nxv2f64(<vscale x 2 x i1>, double*)

0 commit comments

Comments
 (0)