Skip to content

Commit 4ddc200

Browse files
committed
[AArch64][SVE] Lower unpredicated loads/stores as LDR/STR with sve-vector-bits=128.
Given the code below:

```cpp
svuint8_t foo(uint8_t *x) {
  return svld1(svptrue_b8(), x);
}
```

When compiled with -msve-vector-bits=128 (or vscale_range(1, 1)), we currently generate:

```gas
foo:
        ptrue   p0.b
        ld1b    { z0.b }, p0/z, [x0]
        ret
```

Whereas (on little-endian) we could instead be using LDR as follows:

```gas
foo:
        ldr     q0, [x0]
        ret
```

Besides avoiding the predicate dependency, the above form enables further optimisations such as LDP folds. Likewise for stores.
1 parent c69b267 commit 4ddc200

File tree

2 files changed

+95
-356
lines changed

2 files changed

+95
-356
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23654,6 +23654,28 @@ static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
2365423654
return DAG.getMergeValues({Extract, TokenFactor}, DL);
2365523655
}
2365623656

23657+
// Replace packed scalable loads with fixed loads when vscale_range(1, 1).
23658+
// This enables further optimisations such as LDP folds.
23659+
static SDValue combineVScale1Load(LoadSDNode *LD, SelectionDAG &DAG,
23660+
TargetLowering::DAGCombinerInfo &DCI,
23661+
const AArch64Subtarget *Subtarget) {
23662+
EVT MemVT = LD->getMemoryVT();
23663+
if (!DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
23664+
!MemVT.isScalableVector() || LD->getExtensionType() != ISD::NON_EXTLOAD ||
23665+
MemVT.getSizeInBits().getKnownMinValue() != 128 ||
23666+
Subtarget->getMaxSVEVectorSizeInBits() != 128)
23667+
return SDValue();
23668+
23669+
SDLoc DL(LD);
23670+
MVT NewVT = MVT::getVectorVT(MemVT.getVectorElementType().getSimpleVT(),
23671+
MemVT.getVectorMinNumElements());
23672+
SDValue NewLoad = DAG.getLoad(
23673+
NewVT, DL, LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
23674+
LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), LD->getAAInfo());
23675+
SDValue Insert = convertToScalableVector(DAG, MemVT, NewLoad);
23676+
return DAG.getMergeValues({Insert, SDValue(cast<SDNode>(NewLoad), 1)}, DL);
23677+
}
23678+
2365723679
// Perform TBI simplification if supported by the target and try to break up
2365823680
// nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
2365923681
// load instructions can be selected.
@@ -23691,6 +23713,9 @@ static SDValue performLOADCombine(SDNode *N,
2369123713
if (SDValue Res = combineV3I8LoadExt(LD, DAG))
2369223714
return Res;
2369323715

23716+
if (SDValue Res = combineVScale1Load(LD, DAG, DCI, Subtarget))
23717+
return Res;
23718+
2369423719
if (!LD->isNonTemporal())
2369523720
return SDValue(N, 0);
2369623721

@@ -23949,6 +23974,29 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
2394923974
return Chain;
2395023975
}
2395123976

23977+
// Replace packed scalable stores with fixed stores when vscale_range(1, 1).
23978+
static SDValue combineVScale1Store(StoreSDNode *ST, SelectionDAG &DAG,
23979+
TargetLowering::DAGCombinerInfo &DCI,
23980+
const AArch64Subtarget *Subtarget) {
23981+
SDValue Value = ST->getValue();
23982+
EVT ValueVT = Value.getValueType();
23983+
if (ST->isVolatile() || !Subtarget->isLittleEndian() ||
23984+
!DCI.isBeforeLegalize() || !Subtarget->hasNEON() ||
23985+
!ValueVT.isScalableVector() || ST->isTruncatingStore() ||
23986+
ValueVT.getSizeInBits().getKnownMinValue() != 128 ||
23987+
Subtarget->getMaxSVEVectorSizeInBits() != 128)
23988+
return SDValue();
23989+
23990+
SDLoc DL(ST);
23991+
MVT NewVT = MVT::getVectorVT(ValueVT.getVectorElementType().getSimpleVT(),
23992+
ValueVT.getVectorMinNumElements());
23993+
SDValue NewValue = convertFromScalableVector(DAG, NewVT, Value);
23994+
SDValue NewStore = DAG.getStore(
23995+
ST->getChain(), DL, NewValue, ST->getBasePtr(), ST->getPointerInfo(),
23996+
ST->getOriginalAlign(), ST->getMemOperand()->getFlags(), ST->getAAInfo());
23997+
return NewStore;
23998+
}
23999+
2395224000
static unsigned getFPSubregForVT(EVT VT) {
2395324001
assert(VT.isSimple() && "Expected simple VT");
2395424002
switch (VT.getSimpleVT().SimpleTy) {
@@ -23997,6 +24045,9 @@ static SDValue performSTORECombine(SDNode *N,
2399724045
if (SDValue Res = combineI8TruncStore(ST, DAG, Subtarget))
2399824046
return Res;
2399924047

24048+
if (SDValue Res = combineVScale1Store(ST, DAG, DCI, Subtarget))
24049+
return Res;
24050+
2400024051
// If this is an FP_ROUND followed by a store, fold this into a truncating
2400124052
// store. We can do this even if this is already a truncstore.
2400224053
// We purposefully don't care about legality of the nodes here as we know

0 commit comments

Comments
 (0)