@@ -2579,6 +2579,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1Q_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1Q_INDEX_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
@@ -2604,6 +2605,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
MAKE_CASE(AArch64ISD::SST1Q_PRED)
+ MAKE_CASE(AArch64ISD::SST1Q_INDEX_PRED)
MAKE_CASE(AArch64ISD::ST1_PRED)
MAKE_CASE(AArch64ISD::SST1_PRED)
MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
@@ -22761,10 +22763,11 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();

// For FPs, ACLE only supports _packed_ single and double precision types.
- // SST1Q_PRED is the ST1Q for sve2p1 and should allow all sizes
+ // SST1Q_[INDEX_]PRED is the ST1Q for sve2p1 and should allow all sizes.
if (SrcElVT.isFloatingPoint())
if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64) &&
- (Opcode != AArch64ISD::SST1Q_PRED ||
+ ((Opcode != AArch64ISD::SST1Q_PRED &&
+ Opcode != AArch64ISD::SST1Q_INDEX_PRED) ||
((SrcVT != MVT::nxv8f16) && (SrcVT != MVT::nxv8bf16))))
return SDValue();

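The nested floating-point check above is easier to read when pulled out into a predicate. The helper below is purely illustrative (it is not part of the patch and the name is made up); it restates the same rule: packed f32/f64 sources are accepted by every scatter form, while packed f16/bf16 sources are only accepted by the quadword scatters, i.e. SST1Q_PRED and SST1Q_INDEX_PRED.

// Illustrative only; not part of the patch. Assumes the usual
// AArch64ISelLowering.cpp context (EVT, MVT, AArch64ISD) is available.
static bool isAcceptableFPScatterType(unsigned Opcode, EVT SrcVT) {
  // Packed single and double precision sources are always fine.
  if (SrcVT == MVT::nxv4f32 || SrcVT == MVT::nxv2f64)
    return true;
  // Only the ST1Q scatters additionally take packed f16/bf16 sources.
  bool IsST1Q = Opcode == AArch64ISD::SST1Q_PRED ||
                Opcode == AArch64ISD::SST1Q_INDEX_PRED;
  return IsST1Q && (SrcVT == MVT::nxv8f16 || SrcVT == MVT::nxv8bf16);
}
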
@@ -22782,14 +22785,19 @@ static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
Offset =
getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
Opcode = AArch64ISD::SSTNT1_PRED;
+ } else if (Opcode == AArch64ISD::SST1Q_INDEX_PRED) {
+ Offset =
+ getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
+ Opcode = AArch64ISD::SST1Q_PRED;
}

// In the case of non-temporal gather loads there's only one SVE instruction
// per data-size: "scalar + vector", i.e.
// * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
- if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
+ if ((Opcode == AArch64ISD::SSTNT1_PRED || Opcode == AArch64ISD::SST1Q_PRED) &&
+ Offset.getValueType().isVector())
std::swap(Base, Offset);

// SST1_IMM requires that the offset is an immediate that is:
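As with the non-temporal case directly above it, the new SST1Q_INDEX_PRED branch converts element indices into byte offsets and then falls through to the unscaled SST1Q_PRED node, so the index and scalar-offset intrinsics share the rest of the lowering. The sketch below shows what that scaling step is assumed to do; the in-tree getScaledOffsetForBitWidth helper is defined elsewhere in this file and may differ in detail, and the function name here is hypothetical.

// Sketch only (assumed behaviour, hypothetical name): turn a vector of
// element indices into byte offsets by shifting left by log2(element bytes).
// Assumes the surrounding AArch64ISelLowering.cpp context and headers.
static SDValue scaleIndicesToByteOffsets(SelectionDAG &DAG, SDValue Offset,
                                         const SDLoc &DL,
                                         unsigned EltSizeInBits) {
  EVT OffsetVT = Offset.getValueType();
  unsigned ShiftAmt = Log2_32(EltSizeInBits / 8);
  // getConstant with a vector type produces a splat of ShiftAmt.
  SDValue Splat = DAG.getConstant(ShiftAmt, DL, OffsetVT);
  return DAG.getNode(ISD::SHL, DL, OffsetVT, Offset, Splat);
}
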
@@ -22872,21 +22880,26 @@ static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(4);

- // For "scalar + vector of indices", just scale the indices. This only
- // applies to non-temporal gathers because there's no instruction that takes
- // indicies.
+ // For "scalar + vector of indices", scale the indices to obtain unscaled
+ // offsets. This applies to non-temporal and quadword gathers, which do not
+ // have an addressing mode with scaled offset.
if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
RetVT.getScalarSizeInBits());
Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
+ } else if (Opcode == AArch64ISD::GLD1Q_INDEX_MERGE_ZERO) {
+ Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
+ RetVT.getScalarSizeInBits());
+ Opcode = AArch64ISD::GLD1Q_MERGE_ZERO;
}

- // In the case of non-temporal gather loads there's only one SVE instruction
- // per data-size: "scalar + vector", i.e.
- // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
+ // In the case of non-temporal gather loads and quadword gather loads there's
+ // only one addressing mode: "vector + scalar", e.g.
+ // ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
// Since we do have intrinsics that allow the arguments to be in a different
// order, we may need to swap them to match the spec.
- if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
+ if ((Opcode == AArch64ISD::GLDNT1_MERGE_ZERO ||
+ Opcode == AArch64ISD::GLD1Q_MERGE_ZERO) &&
Offset.getValueType().isVector())
std::swap(Base, Offset);

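A small standalone check makes the scale-then-swap reasoning concrete: for 64-bit gather elements, index i and byte offset i * 8 address the same memory, and because the effective address is simply base plus offset per element, exchanging the scalar base and the vector of (now byte-sized) offsets does not change any address. The values below are made up and the snippet is plain C++, not LLVM code.

#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint64_t Base = 0x1000;            // hypothetical scalar base
  const unsigned EltBytes = 8;             // 64-bit gather elements
  const std::vector<uint64_t> Indices = {0, 1, 5};

  for (uint64_t Idx : Indices) {
    uint64_t ByteOff = Idx * EltBytes;     // the scaling step (index -> offset)
    uint64_t AddrOffsetForm = Base + ByteOff;  // scalar base + vector offset
    uint64_t AddrSwappedForm = ByteOff + Base; // vector base + scalar offset
    // Both forms address the same element, which is why the combine can
    // reuse the single "vector base + scalar offset" LDNT1/LD1Q form.
    assert(AddrOffsetForm == AddrSwappedForm);
  }
  return 0;
}
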
@@ -23736,6 +23749,9 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1Q_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ld1q_gather_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLD1Q_INDEX_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_index:
return performGatherLoadCombine(N, DAG,
AArch64ISD::GLD1_SCALED_MERGE_ZERO);
@@ -23781,6 +23797,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_PRED);
+ case Intrinsic::aarch64_sve_st1q_scatter_index:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1Q_INDEX_PRED);
case Intrinsic::aarch64_sve_st1_scatter:
return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
case Intrinsic::aarch64_sve_st1_scatter_index: