Skip to content

Commit 0c7051c

Browse files
authored
Merge pull request #8078 from fhahn/vec3-load
Pick [AArch64] Add custom lowering for load <3 x i8>.
2 parents 6cb93c7 + c9a5ead commit 0c7051c

File tree

2 files changed

+282
-80
lines changed

2 files changed

+282
-80
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 63 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20709,6 +20709,61 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
2070920709
return SDValue();
2071020710
}
2071120711

20712+
// A custom combine to lower load <3 x i8> as the more efficient sequence
20713+
// below:
20714+
// ldrb wX, [x0, #2]
20715+
// ldrh wY, [x0]
20716+
// orr wX, wY, wX, lsl #16
20717+
// fmov s0, wX
20718+
//
20719+
// Note that an alternative sequence with even fewer (although usually more
20720+
// complex/expensive) instructions would be:
20721+
// ld1r.4h { v0 }, [x0], #2
20722+
// ld1.b { v0 }[2], [x0]
20723+
//
20724+
// Generating this sequence unfortunately results in noticeably worse codegen
20725+
// for code that extends the loaded v3i8, due to legalization breaking vector
20726+
// shuffle detection in a way that is very difficult to work around.
20727+
// TODO: Revisit once v3i8 legalization has been improved in general.
20728+
static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
20729+
EVT MemVT = LD->getMemoryVT();
20730+
if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
20731+
LD->getOriginalAlign() >= 4)
20732+
return SDValue();
20733+
20734+
SDLoc DL(LD);
20735+
MachineFunction &MF = DAG.getMachineFunction();
20736+
SDValue Chain = LD->getChain();
20737+
SDValue BasePtr = LD->getBasePtr();
20738+
MachineMemOperand *MMO = LD->getMemOperand();
20739+
assert(LD->getOffset().isUndef() && "undef offset expected");
20740+
20741+
// Load 2 x i8, then 1 x i8.
20742+
SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
20743+
TypeSize Offset2 = TypeSize::getFixed(2);
20744+
SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
20745+
DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
20746+
MF.getMachineMemOperand(MMO, 2, 1));
20747+
20748+
// Extend to i32.
20749+
SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
20750+
SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
20751+
20752+
// Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
20753+
SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
20754+
DAG.getConstant(16, DL, MVT::i32));
20755+
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
20756+
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
20757+
20758+
// Extract v3i8 again.
20759+
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
20760+
DAG.getConstant(0, DL, MVT::i64));
20761+
SDValue TokenFactor = DAG.getNode(
20762+
ISD::TokenFactor, DL, MVT::Other,
20763+
{SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
20764+
return DAG.getMergeValues({Extract, TokenFactor}, DL);
20765+
}
20766+
2071220767
// Perform TBI simplification if supported by the target and try to break up
2071320768
// nontemporal loads larger than 256 bits for odd types so LDNPQ 256-bit
2071420769
// load instructions can be selected.
@@ -20720,10 +20775,16 @@ static SDValue performLOADCombine(SDNode *N,
2072020775
performTBISimplification(N->getOperand(1), DCI, DAG);
2072120776

2072220777
LoadSDNode *LD = cast<LoadSDNode>(N);
20723-
EVT MemVT = LD->getMemoryVT();
20724-
if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
20778+
if (LD->isVolatile() || !Subtarget->isLittleEndian())
2072520779
return SDValue(N, 0);
2072620780

20781+
if (SDValue Res = combineV3I8LoadExt(LD, DAG))
20782+
return Res;
20783+
20784+
if (!LD->isNonTemporal())
20785+
return SDValue(N, 0);
20786+
20787+
EVT MemVT = LD->getMemoryVT();
2072720788
if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
2072820789
MemVT.getSizeInBits() % 256 == 0 ||
2072920790
256 % MemVT.getScalarSizeInBits() != 0)

0 commit comments

Comments
 (0)