Skip to content

Commit f406b28

Browse files
authored
[AArch64][SVE] Fold integer lane extract and store to FPR store (#129756)
This helps avoid pointless fmovs to GPRs, which may be slow, especially in streaming mode.
1 parent 4ad0aa7 commit f406b28

25 files changed

+557
-123
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 62 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23938,6 +23938,20 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
2393823938
return Chain;
2393923939
}
2394023940

23941+
/// Return the AArch64 sub-register index that matches the width of the scalar
/// floating-point type \p VT: f16 -> hsub, f32 -> ssub, f64 -> dsub.
///
/// \p VT must be a simple value type (asserted). Any other type is treated as
/// unreachable.
static unsigned getFPSubregForVT(EVT VT) {
  assert(VT.isSimple() && "Expected simple VT");
  const auto SimpleTy = VT.getSimpleVT().SimpleTy;

  if (SimpleTy == MVT::f16)
    return AArch64::hsub;
  if (SimpleTy == MVT::f32)
    return AArch64::ssub;
  if (SimpleTy == MVT::f64)
    return AArch64::dsub;

  // Only the three FP scalar widths above have a mapping here.
  llvm_unreachable("Unexpected VT!");
}
23954+
2394123955
static SDValue performSTORECombine(SDNode *N,
2394223956
TargetLowering::DAGCombinerInfo &DCI,
2394323957
SelectionDAG &DAG,
@@ -23998,15 +24012,58 @@ static SDValue performSTORECombine(SDNode *N,
2399824012
if (SDValue Store = combineBoolVectorAndTruncateStore(DAG, ST))
2399924013
return Store;
2400024014

24001-
if (ST->isTruncatingStore()) {
24002-
EVT StoreVT = ST->getMemoryVT();
24003-
if (!isHalvingTruncateOfLegalScalableType(ValueVT, StoreVT))
24004-
return SDValue();
24015+
if (ST->isTruncatingStore() &&
24016+
isHalvingTruncateOfLegalScalableType(ValueVT, MemVT)) {
2400524017
if (SDValue Rshrnb =
2400624018
trySimplifySrlAddToRshrnb(ST->getOperand(1), DAG, Subtarget)) {
2400724019
return DAG.getTruncStore(ST->getChain(), ST, Rshrnb, ST->getBasePtr(),
24008-
StoreVT, ST->getMemOperand());
24020+
MemVT, ST->getMemOperand());
24021+
}
24022+
}
24023+
24024+
// This is an integer vector_extract_elt followed by a (possibly truncating)
24025+
// store. We may be able to replace this with a store of an FP subregister.
24026+
if (DCI.isAfterLegalizeDAG() && ST->isUnindexed() &&
24027+
Value.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
24028+
24029+
SDValue Vector = Value.getOperand(0);
24030+
SDValue ExtIdx = Value.getOperand(1);
24031+
EVT VectorVT = Vector.getValueType();
24032+
EVT ElemVT = VectorVT.getVectorElementType();
24033+
if (!ValueVT.isInteger() || ElemVT == MVT::i8 || MemVT == MVT::i8)
24034+
return SDValue();
24035+
if (ValueVT != MemVT && !ST->isTruncatingStore())
24036+
return SDValue();
24037+
24038+
// Heuristic: If there are other users of integer scalars extracted from
24039+
// this vector that won't fold into the store -- abandon folding. Applying
24040+
// this fold may extend the vector lifetime and disrupt paired stores.
24041+
for (const auto &Use : Vector->uses()) {
24042+
if (Use.getResNo() != Vector.getResNo())
24043+
continue;
24044+
const SDNode *User = Use.getUser();
24045+
if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24046+
(!User->hasOneUse() ||
24047+
(*User->user_begin())->getOpcode() != ISD::STORE))
24048+
return SDValue();
2400924049
}
24050+
24051+
EVT FPElemVT = EVT::getFloatingPointVT(ElemVT.getSizeInBits());
24052+
EVT FPVectorVT = VectorVT.changeVectorElementType(FPElemVT);
24053+
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, FPVectorVT, Vector);
24054+
SDValue Ext =
24055+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, FPElemVT, Cast, ExtIdx);
24056+
24057+
EVT FPMemVT = EVT::getFloatingPointVT(MemVT.getSizeInBits());
24058+
if (ST->isTruncatingStore() && FPMemVT != FPElemVT) {
24059+
SDValue Trunc = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
24060+
FPMemVT, Ext);
24061+
return DAG.getStore(ST->getChain(), DL, Trunc, ST->getBasePtr(),
24062+
ST->getMemOperand());
24063+
}
24064+
24065+
return DAG.getStore(ST->getChain(), DL, Ext, ST->getBasePtr(),
24066+
ST->getMemOperand());
2401024067
}
2401124068

2401224069
return SDValue();

llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -70,10 +70,10 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) {
7070
; CHECK-NEXT: movi.2d v0, #0000000000000000
7171
; CHECK-NEXT: movi.2d v2, #0000000000000000
7272
; CHECK-NEXT: str wzr, [x0, #88]
73+
; CHECK-NEXT: str xzr, [x0, #80]
7374
; CHECK-NEXT: uaddlv.8h s1, v0
7475
; CHECK-NEXT: stp q0, q0, [x0, #16]
7576
; CHECK-NEXT: stp q0, q0, [x0, #48]
76-
; CHECK-NEXT: str d0, [x0, #80]
7777
; CHECK-NEXT: mov.s v2[0], v1[0]
7878
; CHECK-NEXT: ucvtf.4s v1, v2
7979
; CHECK-NEXT: str q1, [x0]
@@ -146,11 +146,10 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
146146
; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32:
147147
; CHECK: ; %bb.0: ; %entry
148148
; CHECK-NEXT: movi.2d v0, #0000000000000000
149+
; CHECK-NEXT: str xzr, [x0, #16]
149150
; CHECK-NEXT: uaddlv.4s d1, v0
150151
; CHECK-NEXT: mov.d v0[0], v1[0]
151-
; CHECK-NEXT: movi.2d v1, #0000000000000000
152152
; CHECK-NEXT: ucvtf.2d v0, v0
153-
; CHECK-NEXT: str d1, [x0, #16]
154153
; CHECK-NEXT: fcvtn v0.2s, v0.2d
155154
; CHECK-NEXT: str q0, [x0]
156155
; CHECK-NEXT: ret

0 commit comments

Comments
 (0)