@@ -23654,6 +23654,28 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
   return DAG.getMergeValues({Extract, TokenFactor}, DL);
 }

+// Replace packed scalable loads with fixed loads when vscale_range(1, 1).
+// This enables further optimisations such as LDP folds.
+static SDValue combineVScale1Load(LoadSDNode *LD, SelectionDAG &DAG,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const AArch64Subtarget *Subtarget) {
+  EVT MemVT = LD->getMemoryVT();
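+  // Only combine plain (non-extending) loads of exactly one 128-bit SVE
+  // register on targets where vscale is known to be exactly 1, i.e. where
+  // the scalable type and its fixed 128-bit counterpart share a layout.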
+  if (!DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
+      !MemVT.isScalableVector() || LD->getExtensionType() != ISD::NON_EXTLOAD ||
+      MemVT.getSizeInBits().getKnownMinValue() != 128 ||
+      Subtarget->getMaxSVEVectorSizeInBits() != 128)
+    return SDValue();
+
+  SDLoc DL(LD);
+  MVT NewVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
+                               MemVT.getVectorMinNumElements());
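+  // Re-issue the load as the equivalent fixed-length (NEON) vector type,
+  // reusing the original pointer, alignment, MMO flags and AA metadata.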
+  SDValue NewLoad = DAG.getLoad(
+      NewVT, DL, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
+      LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo());
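+  // Wrap the fixed-length result back up as a scalable vector and return it
+  // together with the new load's chain.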
+  SDValue Insert = convertToScalableVector(DAG, MemVT, NewLoad);
+  return DAG.getMergeValues({Insert, SDValue(cast<SDNode>(NewLoad), 1)}, DL);
+}
+
 // Perform TBI simplification if supported by the target and try to break up
 // nontemporal loads larger than 256 bits for odd types so LDNPQ 256-bit
 // load instructions can be selected.
@@ -23691,6 +23713,9 @@ static SDValue performLOADCombine(SDNode *N,
   if (SDValue Res = combineV3I8LoadExt(LD, DAG))
     return Res;

+  if (SDValue Res = combineVScale1Load(LD, DAG, DCI, Subtarget))
+    return Res;
+
   if (!LD->isNonTemporal())
     return SDValue(N, 0);
@@ -23949,6 +23974,29 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
   return Chain;
 }

+// Replace packed scalable stores with fixed stores when vscale_range(1, 1).
+static SDValue combineVScale1Store(StoreSDNode *ST, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const AArch64Subtarget *Subtarget) {
+  SDValue Value = ST->getValue();
+  EVT ValueVT = Value.getValueType();
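+  // As for loads, only combine plain (non-truncating) 128-bit stores when
+  // vscale is known to be 1; stores additionally bail out on volatile
+  // accesses and big-endian targets.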
+  if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
+      !DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
+      !ValueVT.isScalableVector() || ST->isTruncatingStore() ||
+      ValueVT.getSizeInBits().getKnownMinValue() != 128 ||
+      Subtarget->getMaxSVEVectorSizeInBits() != 128)
+    return SDValue();
+
+  SDLoc DL(ST);
+  MVT NewVT = MVT::getVectorVT(ValueVT.getVectorElementType().getSimpleVT(),
+                               ValueVT.getVectorMinNumElements());
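+  // Extract the value as the equivalent fixed-length (NEON) vector and store
+  // it with the original address, alignment, MMO flags and AA metadata.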
+  SDValue NewValue = convertFromScalableVector(DAG, NewVT, Value);
+  SDValue NewStore = DAG.getStore(
+      ST->getChain(), DL, NewValue, ST->getBasePtr(), ST->getPointerInfo(),
+      ST->getOriginalAlign(), ST->getMemOperand()->getFlags(), ST->getAAInfo());
+  return NewStore;
+}
+
 static unsigned getFPSubregForVT(EVT VT) {
   assert(VT.isSimple() && "Expected simple VT");
   switch (VT.getSimpleVT().SimpleTy) {
@@ -23997,6 +24045,9 @@ static SDValue performSTORECombine(SDNode *N,
   if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
     return Res;

+  if (SDValue Res = combineVScale1Store(ST, DAG, DCI, Subtarget))
+    return Res;
+
   // If this is an FP_ROUND followed by a store, fold this into a truncating
   // store. We can do this even if this is already a truncstore.
   // We purposefully don't care about legality of the nodes here as we know