[AArch64] Add SVE2.1 intrinsics for indexed quadword gather loads and scatter stores #70476

Merged · 2 commits · Nov 21, 2023
12 changes: 12 additions & 0 deletions clang/include/clang/Basic/arm_sve.td
@@ -319,6 +319,12 @@ let TargetGuard = "sve2p1" in {
defm SVLD2Q_VNUM : StructLoad<"svld2q_vnum[_{2}]", "2Pcl", "aarch64_sve_ld2q_sret">;
defm SVLD3Q_VNUM : StructLoad<"svld3q_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3q_sret">;
defm SVLD4Q_VNUM : StructLoad<"svld4q_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4q_sret">;

// Load quadwords (scalar base + vector index)
def SVLD1Q_GATHER_INDICES_U : MInst<"svld1q_gather_[{3}]index[_{d}]", "dPcg", "sUsiUilUlbhfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_index">;

// Load quadwords (vector base + scalar index)
def SVLD1Q_GATHER_INDEX_S : MInst<"svld1q_gather[_{2}base]_index_{d}", "dPgl", "sUsiUilUlbhfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ld1q_gather_scalar_offset">;
}

////////////////////////////////////////////////////////////////////////////////
@@ -464,6 +470,12 @@ let TargetGuard = "sve2p1" in {
defm SVST2Q_VNUM : StructStore<"svst2q_vnum[_{d}]", "vPcl2", "aarch64_sve_st2q">;
defm SVST3Q_VNUM : StructStore<"svst3q_vnum[_{d}]", "vPcl3", "aarch64_sve_st3q">;
defm SVST4Q_VNUM : StructStore<"svst4q_vnum[_{d}]", "vPcl4", "aarch64_sve_st4q">;

// Scatter store quadwords (scalar base + vector index)
def SVST1Q_SCATTER_INDICES_U : MInst<"svst1q_scatter_[{3}]index[_{d}]", "vPpgd", "sUsiUilUlbhfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_index">;

// Scatter store quadwords (vector base + scalar index)
def SVST1Q_SCATTER_INDEX_S : MInst<"svst1q_scatter[_{2}base]_index[_{d}]", "vPgld", "sUsiUilUlbhfd", [IsScatterStore], MemEltTyDefault, "aarch64_sve_st1q_scatter_scalar_offset">;
}

////////////////////////////////////////////////////////////////////////////////
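
For reference, here is a minimal usage sketch of the new gather/scatter intrinsics defined above, assuming a compiler with SVE2.1 enabled (for example -march=armv9-a+sve2p1). The function names are expanded from the svld1q_gather/svst1q_scatter patterns for the f64/u64 case and are meant as illustration rather than as the authoritative ACLE spellings.

#include <arm_sve.h>

// Gather: scalar base + vector of element indices.
svfloat64_t load_index(svbool_t pg, const float64_t *base, svuint64_t idx) {
  return svld1q_gather_u64index_f64(pg, base, idx);
}

// Gather: vector of 64-bit base addresses + scalar element index.
svfloat64_t load_base(svbool_t pg, svuint64_t bases, int64_t idx) {
  return svld1q_gather_u64base_index_f64(pg, bases, idx);
}

// Matching scatter-store forms.
void store_index(svbool_t pg, float64_t *base, svuint64_t idx, svfloat64_t data) {
  svst1q_scatter_u64index_f64(pg, base, idx, data);
}

void store_base(svbool_t pg, svuint64_t bases, int64_t idx, svfloat64_t data) {
  svst1q_scatter_u64base_index_f64(pg, bases, idx, data);
}

In all four forms the index is an element index, not a byte offset; the lowering changes below scale it by the element size.
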
340 changes: 340 additions & 0 deletions clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_loads.c

Large diffs are not rendered by default.

340 changes: 340 additions & 0 deletions clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_store.c

Large diffs are not rendered by default.

25 changes: 25 additions & 0 deletions llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1466,6 +1466,15 @@ class AdvSIMD_GatherLoadQ_VS_Intrinsic
],
[IntrReadMem]>;

class AdvSIMD_GatherLoadQ_SV_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[
llvm_nxv1i1_ty,
llvm_ptr_ty,
llvm_nxv2i64_ty
],
[IntrReadMem, IntrArgMemOnly]>;

class AdvSIMD_GatherLoad_VS_WriteFFR_Intrinsic
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[
@@ -1514,6 +1523,16 @@ class AdvSIMD_ScatterStoreQ_VS_Intrinsic
],
[IntrWriteMem]>;

class AdvSIMD_ScatterStoreQ_SV_Intrinsic
: DefaultAttrsIntrinsic<[],
[
llvm_anyvector_ty,
llvm_nxv1i1_ty,
llvm_ptr_ty,
llvm_nxv2i64_ty
],
[IntrWriteMem, IntrArgMemOnly]>;

class SVE_gather_prf_SV
: DefaultAttrsIntrinsic<[],
[
@@ -2144,6 +2163,9 @@ def int_aarch64_sve_ld1_gather_uxtw : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic
def int_aarch64_sve_ld1_gather_sxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;
def int_aarch64_sve_ld1_gather_uxtw_index : AdvSIMD_GatherLoad_SV_32b_Offsets_Intrinsic;

// 128-bit loads, scaled offsets (indices)
def int_aarch64_sve_ld1q_gather_index : AdvSIMD_GatherLoadQ_SV_Intrinsic;

//
// Gather loads: vector base + scalar offset
//
@@ -2222,6 +2244,9 @@ def int_aarch64_sve_st1_scatter_sxtw_index
def int_aarch64_sve_st1_scatter_uxtw_index
: AdvSIMD_ScatterStore_SV_32b_Offsets_Intrinsic;

// 128-bit stores, scaled offsets (indices)
def int_aarch64_sve_st1q_scatter_index : AdvSIMD_ScatterStoreQ_SV_Intrinsic;

//
// Scatter stores: vector base + scalar offset
//
38 changes: 28 additions & 10 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2579,6 +2579,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1Q_INDEX_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
@@ -2604,6 +2605,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::SST1Q_PRED)
MAKE_CASE(AArch64ISD::SST1Q_INDEX_PRED)
MAKE_CASE(AArch64ISD::ST1_PRED)
MAKE_CASE(AArch64ISD::SST1_PRED)
MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
@@ -22761,10 +22763,11 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();

// For FPs, ACLE only supports _packed_ single and double precision types.
// SST1Q_PRED is the ST1Q for sve2p1 and should allow all sizes
// SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
if (SrcElVT.isFloatingPoint())
if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
(Opcode != AArch64ISD::SST1Q_PRED ||
((Opcode != AArch64ISD::SST1Q_PRED &&
Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
return SDValue();

@@ -22782,14 +22785,19 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
Offset =
getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
Opcode = AArch64ISD::SSTNT1_PRED;
} else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
Offset =
getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
Opcode = AArch64ISD::SST1Q_PRED;
}

// In the case of non-temporal scatter stores there's only one SVE instruction
// per data-size: "vector + scalar", i.e.
// * stnt1{b|h|w|d} { z0.s }, p0, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
Offset.getValueType().isVector())
std::swap(Base, Offset);

// SST1_IMM requires that the offset is an immediate that is:
Expand Down Expand Up @@ -22872,21 +22880,26 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(4);

// For "scalar + vector of indices", just scale the indices. This only
// applies to non-temporal gathers because there's no instruction that takes
// indicies.
// For "scalar + vector of indices", scale the indices to obtain unscaled
// offsets. This applies to non-temporal and quadword gathers, which do not
// have an addressing mode with scaled offset.
if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
RetVT.getScalarSizeInBits());
Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
} else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
RetVT.getScalarSizeInBits());
Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
}

// In the case of non-temporal gather loads there's only one SVE instruction
// per data-size: "scalar + vector", i.e.
// * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// In the case of non-temporal gather loads and quadword gather loads there's
// only one addressing mode: "vector + scalar", e.g.
// ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
Offset.getValueType().isVector())
std::swap(Base, Offset);

@@ -23736,6 +23749,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1q_gather_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLD1_SCALED_MERGE_ZERO);
@@ -23781,6 +23797,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
case Intrinsic::aarch64_sve_st1q_scatter_index:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
case Intrinsic::aarch64_sve_st1_scatter:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
case Intrinsic::aarch64_sve_st1_scatter_index:
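
To make the index handling above concrete, here is an illustrative-only model (not the DAG code itself) of the rewrite applied to the *_index intrinsics: element indices are converted to unscaled byte offsets, since GLD1Q_MERGE_ZERO / SST1Q_PRED and the underlying "vector base + scalar offset" addressing mode work in bytes, and the base and offset operands may then be swapped.

#include <stdint.h>

/* Sketch of the scaling performed via getScaledOffsetForBitWidth when
   performGatherLoadCombine / performScatterStoreCombine lower the *_index
   intrinsics: shift the element index left by log2 of the element size in
   bytes to obtain an unscaled byte offset. */
static inline uint64_t index_to_byte_offset(uint64_t index, unsigned elt_bits) {
  unsigned shift;
  switch (elt_bits) {
  case 16: shift = 1; break; /* 2-byte elements */
  case 32: shift = 2; break; /* 4-byte elements */
  case 64: shift = 3; break; /* 8-byte elements */
  default: shift = 0; break; /* 1-byte elements */
  }
  return index << shift;
}

For a double-precision gather, for example, index 3 becomes byte offset 24; the scaled offset vector then takes the vector operand slot while the pointer ends up as the scalar register operand, matching the single addressing mode noted in the comments.
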
2 changes: 2 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -376,6 +376,7 @@ enum NodeType : unsigned {
GLD1_SXTW_SCALED_MERGE_ZERO,
GLD1_IMM_MERGE_ZERO,
GLD1Q_MERGE_ZERO,
GLD1Q_INDEX_MERGE_ZERO,

// Signed gather loads
GLD1S_MERGE_ZERO,
@@ -421,6 +422,7 @@ enum NodeType : unsigned {
SST1_SXTW_SCALED_PRED,
SST1_IMM_PRED,
SST1Q_PRED,
SST1Q_INDEX_PRED,

// Non-temporal scatter store
SSTNT1_PRED,