@@ -20709,6 +20709,61 @@ static SDValue foldTruncStoreOfExt(SelectionDAG &DAG, SDNode *N) {
   return SDValue();
 }
 
+// A custom combine to lower load <3 x i8> as the more efficient sequence
+// below:
+//    ldrb wX, [x0, #2]
+//    ldrh wY, [x0]
+//    orr wX, wY, wX, lsl #16
+//    fmov s0, wX
+//
+// Note that an alternative sequence with even fewer (although usually more
+// complex/expensive) instructions would be:
+//    ld1r.4h { v0 }, [x0], #2
+//    ld1.b { v0 }[2], [x0]
+//
+// Generating this sequence unfortunately results in noticeably worse codegen
+// for code that extends the loaded v3i8, due to legalization breaking vector
+// shuffle detection in a way that is very difficult to work around.
+// TODO: Revisit once v3i8 legalization has been improved in general.
+static SDValue combineV3I8LoadExt(LoadSDNode *LD, SelectionDAG &DAG) {
+  EVT MemVT = LD->getMemoryVT();
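+  // Only under-aligned (align < 4) <3 x i8> loads are handled here.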
+  if (MemVT != EVT::getVectorVT(*DAG.getContext(), MVT::i8, 3) ||
+      LD->getOriginalAlign() >= 4)
+    return SDValue();
+
+  SDLoc DL(LD);
+  MachineFunction &MF = DAG.getMachineFunction();
+  SDValue Chain = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  MachineMemOperand *MMO = LD->getMemOperand();
+  assert(LD->getOffset().isUndef() && "undef offset expected");
+
+  // Load 2 x i8, then 1 x i8.
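+  // The i16 load covers bytes 0-1 and reuses the original memory operand.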
+  SDValue L16 = DAG.getLoad(MVT::i16, DL, Chain, BasePtr, MMO);
+  TypeSize Offset2 = TypeSize::getFixed(2);
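+  // Derive a memory operand for the third byte: original MMO at offset 2,
+  // size 1.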
+  SDValue L8 = DAG.getLoad(MVT::i8, DL, Chain,
+                           DAG.getMemBasePlusOffset(BasePtr, Offset2, DL),
+                           MF.getMachineMemOperand(MMO, 2, 1));
+
+  // Extend to i32.
+  SDValue Ext16 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L16);
+  SDValue Ext8 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, L8);
+
+  // Pack 2 x i8 and 1 x i8 in an i32 and convert to v4i8.
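+  // On little-endian targets the i32 bytes map directly to v4i8 lanes 0-3,
+  // which is why this combine is limited to little-endian by the caller.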
+  SDValue Shl = DAG.getNode(ISD::SHL, DL, MVT::i32, Ext8,
+                            DAG.getConstant(16, DL, MVT::i32));
+  SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i32, Ext16, Shl);
+  SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v4i8, Or);
+
+  // Extract v3i8 again.
+  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MemVT, Cast,
+                                DAG.getConstant(0, DL, MVT::i64));
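+  // Merge the chains of the two partial loads so chain users of the original
+  // load depend on both.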
+  SDValue TokenFactor = DAG.getNode(
+      ISD::TokenFactor, DL, MVT::Other,
+      {SDValue(cast<SDNode>(L16), 1), SDValue(cast<SDNode>(L8), 1)});
+  return DAG.getMergeValues({Extract, TokenFactor}, DL);
+}
+
 // Perform TBI simplification if supported by the target and try to break up
 // nontemporal loads larger than 256-bits loads for odd types so LDNPQ 256-bit
 // load instructions can be selected.
@@ -20720,10 +20775,16 @@ static SDValue performLOADCombine(SDNode *N,
     performTBISimplification(N->getOperand(1), DCI, DAG);
 
   LoadSDNode *LD = cast<LoadSDNode>(N);
-  EVT MemVT = LD->getMemoryVT();
-  if (LD->isVolatile() || !LD->isNonTemporal() || !Subtarget->isLittleEndian())
+  if (LD->isVolatile() || !Subtarget->isLittleEndian())
     return SDValue(N, 0);
 
+  if (SDValue Res = combineV3I8LoadExt(LD, DAG))
+    return Res;
+
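+  // The remaining combines only apply to nontemporal loads.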
+  if (!LD->isNonTemporal())
+    return SDValue(N, 0);
+
+  EVT MemVT = LD->getMemoryVT();
   if (MemVT.isScalableVector() || MemVT.getSizeInBits() <= 256 ||
       MemVT.getSizeInBits() % 256 == 0 ||
       256 % MemVT.getScalarSizeInBits() != 0)