Skip to content

Commit e14277e

Browse files
committed
Generalize fold
1 parent e921a7f commit e14277e

35 files changed: +1014 additions, −811 deletions.

llvm/lib/CodeGen/ValueTypes.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -223,6 +223,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
223223
case MVT::x86mmx: return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1);
224224
case MVT::aarch64svcount:
225225
return TargetExtType::get(Context, "aarch64.svcount");
226+
case MVT::aarch64mfp8:
227+
return VectorType::get(IntegerType::get(Context, 8), ElementCount::getFixed(1));
226228
case MVT::x86amx: return Type::getX86_AMXTy(Context);
227229
case MVT::i64x8: return IntegerType::get(Context, 512);
228230
case MVT::amdgpuBufferFatPointer: return IntegerType::get(Context, 160);

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 47 additions & 77 deletions
Original file line number | Diff line number | Diff line change
@@ -1394,9 +1394,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
13941394
}
13951395
}
13961396

1397-
// v1i64 -> v1i8 truncstore represents a bsub FPR8 store.
1398-
setTruncStoreAction(MVT::v1i64, MVT::v1i8, Legal);
1399-
14001397
for (auto Op :
14011398
{ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
14021399
ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
@@ -23981,6 +23978,8 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
2398123978
static unsigned getFPSubregForVT(EVT VT) {
2398223979
assert(VT.isSimple() && "Expected simple VT");
2398323980
switch (VT.getSimpleVT().SimpleTy) {
23981+
case MVT::aarch64mfp8:
23982+
return AArch64::bsub;
2398423983
case MVT::f16:
2398523984
return AArch64::hsub;
2398623985
case MVT::f32:
@@ -23992,22 +23991,6 @@ static unsigned getFPSubregForVT(EVT VT) {
2399223991
}
2399323992
}
2399423993

23995-
static EVT get64BitVector(EVT ElVT) {
23996-
assert(ElVT.isSimple() && "Expected simple VT");
23997-
switch (ElVT.getSimpleVT().SimpleTy) {
23998-
case MVT::i8:
23999-
return MVT::v8i8;
24000-
case MVT::i16:
24001-
return MVT::v4i16;
24002-
case MVT::i32:
24003-
return MVT::v2i32;
24004-
case MVT::i64:
24005-
return MVT::v1i64;
24006-
default:
24007-
llvm_unreachable("Unexpected VT!");
24008-
}
24009-
}
24010-
2401123994
static SDValue performSTORECombine(SDNode *N,
2401223995
TargetLowering::DAGCombinerInfo &DCI,
2401323996
SelectionDAG &DAG,
@@ -24086,72 +24069,63 @@ static SDValue performSTORECombine(SDNode *N,
2408624069
SDValue ExtIdx = Value.getOperand(1);
2408724070
EVT VectorVT = Vector.getValueType();
2408824071
EVT ElemVT = VectorVT.getVectorElementType();
24072+
2408924073
if (!ValueVT.isInteger())
2409024074
return SDValue();
2409124075
if (ValueVT != MemVT && !ST->isTruncatingStore())
2409224076
return SDValue();
2409324077

24094-
if (MemVT == MVT::i8) {
24095-
auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
24096-
if (Subtarget->isNeonAvailable() &&
24097-
(VectorVT == MVT::v8i8 || VectorVT == MVT::v16i8) && ExtCst &&
24098-
!ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD) {
24099-
// These can lower to st1.b, which is preferable if we're unlikely to
24100-
// fold the addressing into the store.
24101-
return SDValue();
24102-
}
24103-
24104-
// Lower as truncstore of v1i64 -> v1i8 (which can lower to a bsub store).
24105-
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
24106-
SDValue ExtVector;
24107-
EVT VecVT64 = get64BitVector(ElemVT);
24108-
if (ExtCst && ExtCst->isZero()) {
24109-
ExtVector =
24110-
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT64, Vector, Zero);
24111-
} else {
24112-
SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
24113-
Value.getValueType(), Vector, ExtIdx);
24114-
ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT64,
24115-
DAG.getUNDEF(VecVT64), Ext, Zero);
24116-
}
24117-
24118-
SDValue Cast = DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, ExtVector);
24119-
return DAG.getTruncStore(ST->getChain(), DL, Cast, ST->getBasePtr(),
24120-
MVT::v1i8, ST->getMemOperand());
24121-
}
24122-
24123-
// TODO: Handle storing i8s to wider types.
24124-
if (ElemVT == MVT::i8)
24078+
// This could generate an additional extract if the index is non-zero and
24079+
// the extracted value has multiple uses.
24080+
auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
24081+
if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
2412524082
return SDValue();
2412624083

24127-
// Heuristic: If there are other users of integer scalars extracted from
24128-
// this vector that won't fold into the store -- abandon folding. Applying
24129-
// this fold may extend the vector lifetime and disrupt paired stores.
24130-
for (const auto &Use : Vector->uses()) {
24131-
if (Use.getResNo() != Vector.getResNo())
24132-
continue;
24133-
const SDNode *User = Use.getUser();
24134-
if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24135-
(!User->hasOneUse() ||
24136-
(*User->user_begin())->getOpcode() != ISD::STORE))
24137-
return SDValue();
24084+
if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
24085+
(VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
24086+
!ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD) {
24087+
// These can lower to st1, which is preferable if we're unlikely to fold
24088+
// the addressing into the store.
24089+
return SDValue();
2413824090
}
2413924091

24140-
EVT FPElemVT = EVT::getFloatingPointVT(ElemVT.getSizeInBits());
24141-
EVT FPVectorVT = VectorVT.changeVectorElementType(FPElemVT);
24142-
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, FPVectorVT, Vector);
24143-
SDValue Ext =
24144-
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, FPElemVT, Cast, ExtIdx);
24092+
if (MemVT == MVT::i64 || MemVT == MVT::i32) {
24093+
// Heuristic: If there are other users of w/x integer scalars extracted
24094+
// from this vector that won't fold into the store -- abandon folding.
24095+
// Applying this fold may disrupt paired stores.
24096+
for (const auto &Use : Vector->uses()) {
24097+
if (Use.getResNo() != Vector.getResNo())
24098+
continue;
24099+
const SDNode *User = Use.getUser();
24100+
if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24101+
(!User->hasOneUse() ||
24102+
(*User->user_begin())->getOpcode() != ISD::STORE))
24103+
return SDValue();
24104+
}
24105+
}
2414524106

24146-
EVT FPMemVT = EVT::getFloatingPointVT(MemVT.getSizeInBits());
24147-
if (ST->isTruncatingStore() && FPMemVT != FPElemVT) {
24148-
SDValue Trunc = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
24149-
FPMemVT, Ext);
24150-
return DAG.getStore(ST->getChain(), DL, Trunc, ST->getBasePtr(),
24151-
ST->getMemOperand());
24107+
SDValue ExtVector = Vector;
24108+
if (!ExtCst || !ExtCst->isZero()) {
24109+
// Handle extracting from lanes != 0.
24110+
SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
24111+
Value.getValueType(), Vector, ExtIdx);
24112+
// FIXME: Using a fixed-size vector for the insertion should not be
24113+
// necessary, but SVE ISEL is missing some folds to avoid fmovs.
24114+
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
24115+
EVT InsertVectorVT = EVT::getVectorVT(
24116+
*DAG.getContext(), ElemVT,
24117+
VectorVT.getVectorElementCount().getKnownMinValue(), false);
24118+
ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, InsertVectorVT,
24119+
DAG.getUNDEF(InsertVectorVT), Ext, Zero);
2415224120
}
2415324121

24154-
return DAG.getStore(ST->getChain(), DL, Ext, ST->getBasePtr(),
24122+
EVT FPMemVT = MemVT == MVT::i8
24123+
? MVT::aarch64mfp8
24124+
: EVT::getFloatingPointVT(MemVT.getSizeInBits());
24125+
SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
24126+
FPMemVT, ExtVector);
24127+
24128+
return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(),
2415524129
ST->getMemOperand());
2415624130
}
2415724131

@@ -28878,10 +28852,6 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
2887828852
auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
2887928853
auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
2888028854

28881-
// Can be lowered to a bsub store in ISEL.
28882-
if (VT == MVT::v1i64 && MemVT == MVT::v1i8)
28883-
return SDValue();
28884-
2888528855
if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
2888628856
EVT TruncVT = ContainerVT.changeVectorElementType(
2888728857
Store->getMemoryVT().getVectorElementType());

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4589,11 +4589,11 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
45894589
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
45904590
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
45914591

4592-
// v1i64 -> bsub truncating stores
4593-
def : Pat<(truncstorevi8 v1i64:$VT, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
4594-
(STURBi (aarch64mfp8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, simm9:$offset)>;
4595-
def : Pat<(truncstorevi8 v1i64:$VT, (am_indexed8 GPR64sp:$Rn, uimm12s4:$offset)),
4596-
(STRBui (aarch64mfp8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, uimm12s4:$offset)>;
4592+
// aarch64mfp8 (bsub) stores
4593+
def : Pat<(store aarch64mfp8:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
4594+
(STURBi FPR8:$Rt, GPR64sp:$Rn, simm9:$offset)>;
4595+
def : Pat<(store aarch64mfp8:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s4:$offset)),
4596+
(STRBui FPR8:$Rt, GPR64sp:$Rn, uimm12s4:$offset)>;
45974597

45984598
// Match stores from lane 0 to the appropriate subreg's store.
45994599
multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,

llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -70,10 +70,10 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) {
7070
; CHECK-NEXT: movi.2d v0, #0000000000000000
7171
; CHECK-NEXT: movi.2d v2, #0000000000000000
7272
; CHECK-NEXT: str wzr, [x0, #88]
73-
; CHECK-NEXT: str xzr, [x0, #80]
7473
; CHECK-NEXT: uaddlv.8h s1, v0
7574
; CHECK-NEXT: stp q0, q0, [x0, #16]
7675
; CHECK-NEXT: stp q0, q0, [x0, #48]
76+
; CHECK-NEXT: str d0, [x0, #80]
7777
; CHECK-NEXT: mov.s v2[0], v1[0]
7878
; CHECK-NEXT: ucvtf.4s v1, v2
7979
; CHECK-NEXT: str q1, [x0]
@@ -146,12 +146,13 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
146146
; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32:
147147
; CHECK: ; %bb.0: ; %entry
148148
; CHECK-NEXT: movi.2d v0, #0000000000000000
149-
; CHECK-NEXT: str xzr, [x0, #16]
149+
; CHECK-NEXT: movi.2d v2, #0000000000000000
150150
; CHECK-NEXT: uaddlv.4s d1, v0
151+
; CHECK-NEXT: str d0, [x0, #16]
151152
; CHECK-NEXT: fmov x8, d1
152153
; CHECK-NEXT: ucvtf s1, x8
153-
; CHECK-NEXT: mov.s v0[0], v1[0]
154-
; CHECK-NEXT: str q0, [x0]
154+
; CHECK-NEXT: mov.s v2[0], v1[0]
155+
; CHECK-NEXT: str q2, [x0]
155156
; CHECK-NEXT: ret
156157

157158
entry:

0 commit comments

Comments (0)