Commit 28f62d7

[AArch64] Add SVE2.1 intrinsics for indexed quadword gather loads and scatter stores (#70476)

This patch adds the quadword gather load intrinsics of the form

  sv<type>_t svld1q_gather_u64index_<typ>(svbool_t, const <type>_t *, svuint64_t);
  sv<type>_t svld1q_gather_u64base_index_<typ>(svbool_t, svuint64_t, int64_t);

and the quadword scatter store intrinsics of the form

  void svst1q_scatter_u64index_<typ>(svbool_t, <type>_t *, svuint64_t, sv<type>_t);
  void svst1q_scatter_u64base_index_<typ>(svbool_t, svuint64_t, int64_t, sv<type>_t);

ACLE spec: ARM-software/acle#257
1 parent c0a1fcd commit 28f62d7
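For context, a minimal usage sketch of the two "vector index" intrinsics (not part of the commit; the function name is hypothetical, and it assumes a compiler where these SVE2.1 intrinsics are available, e.g. with -march=armv9-a+sve2p1):

  // Gather int32 data through 64-bit indices, then scatter it back
  // out with the same indices.
  #include <arm_sve.h>

  svint32_t copy_indexed(svbool_t pg, const int32_t *src, int32_t *dst,
                         svuint64_t indices) {
    // Each index is scaled by sizeof(int32_t) before being added to src/dst.
    svint32_t data = svld1q_gather_u64index_s32(pg, src, indices);
    svst1q_scatter_u64index_s32(pg, dst, indices, data);
    return data;
  }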

File tree: 8 files changed (+1244, -10 lines)

clang/include/clang/Basic/arm_sve.td

Lines changed: 12 additions & 0 deletions
@@ -319,6 +319,12 @@ let TargetGuard = "sve2p1" in {
 defm SVLD2Q_VNUM : StructLoad<"svld2q_vnum[_{2}]", "2Pcl", "aarch64_sve_ld2q_sret">;
 defm SVLD3Q_VNUM : StructLoad<"svld3q_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3q_sret">;
 defm SVLD4Q_VNUM : StructLoad<"svld4q_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4q_sret">;
+
+// Load quadwords (scalar base + vector index)
+def SVLD1Q_GATHER_INDICES_U : MInst<"svld1q_gather_[{3}]index[_{d}]", "dPcg", "sUsiUilUlbhfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_index">;
+
+// Load quadwords (vector base + scalar index)
+def SVLD1Q_GATHER_INDEX_S : MInst<"svld1q_gather[_{2}base]_index_{d}", "dPgl", "sUsiUilUlbhfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
 }

 ////////////////////////////////////////////////////////////////////////////////

@@ -464,6 +470,12 @@ let TargetGuard = "sve2p1" in {
 defm SVST2Q_VNUM : StructStore<"svst2q_vnum[_{d}]", "vPcl2", "aarch64_sve_st2q">;
 defm SVST3Q_VNUM : StructStore<"svst3q_vnum[_{d}]", "vPcl3", "aarch64_sve_st3q">;
 defm SVST4Q_VNUM : StructStore<"svst4q_vnum[_{d}]", "vPcl4", "aarch64_sve_st4q">;
+
+// Scatter store quadwords (scalar base + vector index)
+def SVST1Q_SCATTER_INDICES_U : MInst<"svst1q_scatter_[{3}]index[_{d}]", "vPpgd", "sUsiUilUlbhfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_index">;
+
+// Scatter store quadwords (vector base + scalar index)
+def SVST1Q_SCATTER_INDEX_S : MInst<"svst1q_scatter[_{2}base]_index[_{d}]", "vPgld", "sUsiUilUlbhfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
 }

 ////////////////////////////////////////////////////////////////////////////////
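The only difference between the new "index" spellings and the "offset" forms added by the earlier unscaled-offset patch is that each 64-bit index is pre-scaled by the element size. A sketch of that equivalence, assuming the svld1q_gather_u64offset_s32 intrinsic from that earlier patch (illustrative only, not how the compiler expands it):

  // Model of svld1q_gather_u64index_s32 in terms of the unscaled-offset
  // form: indices become byte offsets (idx * sizeof(int32_t) == idx << 2).
  #include <arm_sve.h>

  svint32_t ld1q_index_as_offset(svbool_t pg, const int32_t *base,
                                 svuint64_t idx) {
    svuint64_t byte_off = svlsl_n_u64_x(svptrue_b64(), idx, 2);
    return svld1q_gather_u64offset_s32(pg, base, byte_off);
  }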

clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c

Lines changed: 340 additions & 0 deletions
Large diffs are not rendered by default.

clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c

Lines changed: 340 additions & 0 deletions
Large diffs are not rendered by default.

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 25 additions & 0 deletions
@@ -1466,6 +1466,15 @@ class AdvSIMD_GatherLoadQ_VS_Intrinsic
                 ],
                 [IntrReadMem]>;

+class AdvSIMD_GatherLoadQ_SV_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [
+                              llvm_nxv1i1_ty,
+                              llvm_ptr_ty,
+                              llvm_nxv2i64_ty
+                            ],
+                            [IntrReadMem, IntrArgMemOnly]>;
+
 class AdvSIMD_GatherLoad_VS_WriteFFR_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                             [

@@ -1514,6 +1523,16 @@ class AdvSIMD_ScatterStoreQ_VS_Intrinsic
                 ],
                 [IntrWriteMem]>;

+class AdvSIMD_ScatterStoreQ_SV_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+                            [
+                              llvm_anyvector_ty,
+                              llvm_nxv1i1_ty,
+                              llvm_ptr_ty,
+                              llvm_nxv2i64_ty
+                            ],
+                            [IntrWriteMem, IntrArgMemOnly]>;
+
 class SVE_gather_prf_SV
     : DefaultAttrsIntrinsic<[],
                             [

@@ -2144,6 +2163,9 @@ def int_aarch64_sve_ld1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsi
 def int_aarch64_sve_ld1_gather_sxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
 def int_aarch64_sve_ld1_gather_uxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;

+// 128-bit loads, scaled offsets (indices)
+def int_aarch64_sve_ld1q_gather_index : AdvSIMD_GatherLoadQ_SV_Intrinsic;
+
 //
 // Gather loads: vector base + scalar offset
 //

@@ -2222,6 +2244,9 @@ def int_aarch64_sve_st1_scatter_sxtw_index
 def int_aarch64_sve_st1_scatter_uxtw_index
     : AdvSIMD_ScatterStore_SV_32b_Offsets_Intrinsic;

+// 128-bit stores, scaled offsets (indices)
+def int_aarch64_sve_st1q_scatter_index : AdvSIMD_ScatterStoreQ_SV_Intrinsic;
+
 //
 // Scatter stores: vector base + scalar offset
 //
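On the IR side, llvm_nxv1i1_ty corresponds to one predicate bit per 128-bit quadword, and llvm_nxv2i64_ty carries the indices. The store-side picture mirrors the load sketch above (again assuming the unscaled-offset intrinsic from the earlier patch; illustrative only):

  // Model of svst1q_scatter_u64index_s32 via the unscaled-offset form.
  #include <arm_sve.h>

  void st1q_index_as_offset(svbool_t pg, int32_t *base, svuint64_t idx,
                            svint32_t data) {
    svuint64_t byte_off = svlsl_n_u64_x(svptrue_b64(), idx, 2);
    svst1q_scatter_u64offset_s32(pg, base, byte_off, data);
  }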

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 28 additions & 10 deletions
@@ -2579,6 +2579,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
     MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
     MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO)
+    MAKE_CASE(AArch64ISD::GLD1Q_INDEX_MERGE_ZERO)
     MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
     MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
     MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)

@@ -2604,6 +2605,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
     MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
     MAKE_CASE(AArch64ISD::SST1Q_PRED)
+    MAKE_CASE(AArch64ISD::SST1Q_INDEX_PRED)
     MAKE_CASE(AArch64ISD::ST1_PRED)
     MAKE_CASE(AArch64ISD::SST1_PRED)
     MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)

@@ -22761,10 +22763,11 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();

   // For FPs, ACLE only supports _packed_ single and double precision types.
-  // SST1Q_PRED is the ST1Q for sve2p1 and should allow all sizes
+  // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
   if (SrcElVT.isFloatingPoint())
     if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
-        (Opcode != AArch64ISD::SST1Q_PRED ||
+        ((Opcode != AArch64ISD::SST1Q_PRED &&
+          Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
          ((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
       return SDValue();

@@ -22782,14 +22785,19 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
     Offset =
         getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
     Opcode = AArch64ISD::SSTNT1_PRED;
+  } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
+    Offset =
+        getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
+    Opcode = AArch64ISD::SST1Q_PRED;
   }

   // In the case of non-temporal gather loads there's only one SVE instruction
   // per data-size: "scalar + vector", i.e.
   // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
   // Since we do have intrinsics that allow the arguments to be in a different
   // order, we may need to swap them to match the spec.
-  if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
+  if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
+      Offset.getValueType().isVector())
     std::swap(Base, Offset);

   // SST1_IMM requires that the offset is an immediate that is:

@@ -22872,21 +22880,26 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
   // vector of offsets (that fits into one register)
   SDValue Offset = N->getOperand(4);

-  // For "scalar + vector of indices", just scale the indices. This only
-  // applies to non-temporal gathers because there's no instruction that takes
-  // indicies.
+  // For "scalar + vector of indices", scale the indices to obtain unscaled
+  // offsets. This applies to non-temporal and quadword gathers, which do not
+  // have an addressing mode with scaled offset.
   if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
     Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
                                         RetVT.getScalarSizeInBits());
     Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
+  } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
+    Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
+                                        RetVT.getScalarSizeInBits());
+    Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
   }

-  // In the case of non-temporal gather loads there's only one SVE instruction
-  // per data-size: "scalar + vector", i.e.
-  // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
+  // In the case of non-temporal gather loads and quadword gather loads there's
+  // only one addressing mode : "vector + scalar", e.g.
+  // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
   // Since we do have intrinsics that allow the arguments to be in a different
   // order, we may need to swap them to match the spec.
-  if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
+  if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
+       Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
       Offset.getValueType().isVector())
     std::swap(Base, Offset);

@@ -23736,6 +23749,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
   case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
     return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
+  case Intrinsic::aarch64_sve_ld1q_gather_index:
+    return performGatherLoadCombine(N, DAG,
+                                    AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
   case Intrinsic::aarch64_sve_ld1_gather_index:
     return performGatherLoadCombine(N, DAG,
                                     AArch64ISD::GLD1_SCALED_MERGE_ZERO);

@@ -23781,6 +23797,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
                                     AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
   case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
     return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
+  case Intrinsic::aarch64_sve_st1q_scatter_index:
+    return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
   case Intrinsic::aarch64_sve_st1_scatter:
     return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
   case Intrinsic::aarch64_sve_st1_scatter_index:
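In other words, both combines normalize the "scalar base + vector index" intrinsics onto the single LD1Q/ST1Q addressing mode ("vector base + scalar offset"): indices are multiplied up to byte offsets, then the operands are swapped. A scalar model of the resulting per-element address (hypothetical helper; it assumes getScaledOffsetForBitWidth scales each index by the element size in bytes):

  #include <stdint.h>

  // Address seen by one quadword element after the combine.
  static inline uint64_t lane_address(uint64_t scalar_base, uint64_t index,
                                      unsigned elt_size_bits) {
    // Index -> unscaled byte offset (SVE element sizes are powers of two).
    uint64_t byte_offset = index * (elt_size_bits / 8);
    // After std::swap(Base, Offset): the vector element acts as the base
    // and the original scalar pointer becomes the offset.
    return byte_offset + scalar_base;
  }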

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 2 additions & 0 deletions
@@ -376,6 +376,7 @@ enum NodeType : unsigned {
   GLD1_SXTW_SCALED_MERGE_ZERO,
   GLD1_IMM_MERGE_ZERO,
   GLD1Q_MERGE_ZERO,
+  GLD1Q_INDEX_MERGE_ZERO,

   // Signed gather loads
   GLD1S_MERGE_ZERO,

@@ -421,6 +422,7 @@ enum NodeType : unsigned {
   SST1_SXTW_SCALED_PRED,
   SST1_IMM_PRED,
   SST1Q_PRED,
+  SST1Q_INDEX_PRED,

   // Non-temporal scatter store
   SSTNT1_PRED,
