Skip to content

Commit 6857792

Browse files
committed
Generalize fold
1 parent 36d54fe commit 6857792

35 files changed

+586
-487
lines changed

llvm/lib/CodeGen/ValueTypes.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -223,6 +223,8 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
223223
case MVT::x86mmx: return llvm::FixedVectorType::get(llvm::IntegerType::get(Context, 64), 1);
224224
case MVT::aarch64svcount:
225225
return TargetExtType::get(Context, "aarch64.svcount");
226+
case MVT::aarch64mfp8:
227+
return VectorType::get(IntegerType::get(Context, 8), ElementCount::getFixed(1));
226228
case MVT::x86amx: return Type::getX86_AMXTy(Context);
227229
case MVT::i64x8: return IntegerType::get(Context, 512);
228230
case MVT::amdgpuBufferFatPointer: return IntegerType::get(Context, 160);

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 47 additions & 77 deletions
Original file line number | Diff line number | Diff line change
@@ -1394,9 +1394,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
13941394
}
13951395
}
13961396

1397-
// v1i64 -> v1i8 truncstore represents a bsub FPR8 store.
1398-
setTruncStoreAction(MVT::v1i64, MVT::v1i8, Legal);
1399-
14001397
for (auto Op :
14011398
{ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC,
14021399
ISD::FROUND, ISD::FROUNDEVEN, ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE,
@@ -23936,6 +23933,8 @@ static SDValue combineI8TruncStore(StoreSDNode *ST, SelectionDAG &DAG,
2393623933
static unsigned getFPSubregForVT(EVT VT) {
2393723934
assert(VT.isSimple() && "Expected simple VT");
2393823935
switch (VT.getSimpleVT().SimpleTy) {
23936+
case MVT::aarch64mfp8:
23937+
return AArch64::bsub;
2393923938
case MVT::f16:
2394023939
return AArch64::hsub;
2394123940
case MVT::f32:
@@ -23947,22 +23946,6 @@ static unsigned getFPSubregForVT(EVT VT) {
2394723946
}
2394823947
}
2394923948

23950-
static EVT get64BitVector(EVT ElVT) {
23951-
assert(ElVT.isSimple() && "Expected simple VT");
23952-
switch (ElVT.getSimpleVT().SimpleTy) {
23953-
case MVT::i8:
23954-
return MVT::v8i8;
23955-
case MVT::i16:
23956-
return MVT::v4i16;
23957-
case MVT::i32:
23958-
return MVT::v2i32;
23959-
case MVT::i64:
23960-
return MVT::v1i64;
23961-
default:
23962-
llvm_unreachable("Unexpected VT!");
23963-
}
23964-
}
23965-
2396623949
static SDValue performSTORECombine(SDNode *N,
2396723950
TargetLowering::DAGCombinerInfo &DCI,
2396823951
SelectionDAG &DAG,
@@ -24041,72 +24024,63 @@ static SDValue performSTORECombine(SDNode *N,
2404124024
SDValue ExtIdx = Value.getOperand(1);
2404224025
EVT VectorVT = Vector.getValueType();
2404324026
EVT ElemVT = VectorVT.getVectorElementType();
24027+
2404424028
if (!ValueVT.isInteger())
2404524029
return SDValue();
2404624030
if (ValueVT != MemVT && !ST->isTruncatingStore())
2404724031
return SDValue();
2404824032

24049-
if (MemVT == MVT::i8) {
24050-
auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
24051-
if (Subtarget->isNeonAvailable() &&
24052-
(VectorVT == MVT::v8i8 || VectorVT == MVT::v16i8) && ExtCst &&
24053-
!ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD) {
24054-
// These can lower to st1.b, which is preferable if we're unlikely to
24055-
// fold the addressing into the store.
24056-
return SDValue();
24057-
}
24058-
24059-
// Lower as truncstore of v1i64 -> v1i8 (which can lower to a bsub store).
24060-
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
24061-
SDValue ExtVector;
24062-
EVT VecVT64 = get64BitVector(ElemVT);
24063-
if (ExtCst && ExtCst->isZero()) {
24064-
ExtVector =
24065-
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT64, Vector, Zero);
24066-
} else {
24067-
SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
24068-
Value.getValueType(), Vector, ExtIdx);
24069-
ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecVT64,
24070-
DAG.getUNDEF(VecVT64), Ext, Zero);
24071-
}
24072-
24073-
SDValue Cast = DAG.getNode(AArch64ISD::NVCAST, DL, MVT::v1i64, ExtVector);
24074-
return DAG.getTruncStore(ST->getChain(), DL, Cast, ST->getBasePtr(),
24075-
MVT::v1i8, ST->getMemOperand());
24076-
}
24077-
24078-
// TODO: Handle storing i8s to wider types.
24079-
if (ElemVT == MVT::i8)
24033+
// This could generate an additional extract if the index is non-zero and
24034+
// the extracted value has multiple uses.
24035+
auto *ExtCst = dyn_cast<ConstantSDNode>(ExtIdx);
24036+
if ((!ExtCst || !ExtCst->isZero()) && !Value.hasOneUse())
2408024037
return SDValue();
2408124038

24082-
// Heuristic: If there are other users of integer scalars extracted from
24083-
// this vector that won't fold into the store -- abandon folding. Applying
24084-
// this fold may extend the vector lifetime and disrupt paired stores.
24085-
for (const auto &Use : Vector->uses()) {
24086-
if (Use.getResNo() != Vector.getResNo())
24087-
continue;
24088-
const SDNode *User = Use.getUser();
24089-
if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24090-
(!User->hasOneUse() ||
24091-
(*User->user_begin())->getOpcode() != ISD::STORE))
24092-
return SDValue();
24039+
if (Subtarget->isNeonAvailable() && ElemVT == MemVT &&
24040+
(VectorVT.is64BitVector() || VectorVT.is128BitVector()) && ExtCst &&
24041+
!ExtCst->isZero() && ST->getBasePtr().getOpcode() != ISD::ADD) {
24042+
// These can lower to st1, which is preferable if we're unlikely to fold
24043+
// the addressing into the store.
24044+
return SDValue();
2409324045
}
2409424046

24095-
EVT FPElemVT = EVT::getFloatingPointVT(ElemVT.getSizeInBits());
24096-
EVT FPVectorVT = VectorVT.changeVectorElementType(FPElemVT);
24097-
SDValue Cast = DAG.getNode(ISD::BITCAST, DL, FPVectorVT, Vector);
24098-
SDValue Ext =
24099-
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, FPElemVT, Cast, ExtIdx);
24047+
if (MemVT == MVT::i64 || MemVT == MVT::i32) {
24048+
// Heuristic: If there are other users of w/x integer scalars extracted
24049+
// from this vector that won't fold into the store -- abandon folding.
24050+
// Applying this fold may disrupt paired stores.
24051+
for (const auto &Use : Vector->uses()) {
24052+
if (Use.getResNo() != Vector.getResNo())
24053+
continue;
24054+
const SDNode *User = Use.getUser();
24055+
if (User->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
24056+
(!User->hasOneUse() ||
24057+
(*User->user_begin())->getOpcode() != ISD::STORE))
24058+
return SDValue();
24059+
}
24060+
}
2410024061

24101-
EVT FPMemVT = EVT::getFloatingPointVT(MemVT.getSizeInBits());
24102-
if (ST->isTruncatingStore() && FPMemVT != FPElemVT) {
24103-
SDValue Trunc = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
24104-
FPMemVT, Ext);
24105-
return DAG.getStore(ST->getChain(), DL, Trunc, ST->getBasePtr(),
24106-
ST->getMemOperand());
24062+
SDValue ExtVector = Vector;
24063+
if (!ExtCst || !ExtCst->isZero()) {
24064+
// Handle extracting from lanes != 0.
24065+
SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
24066+
Value.getValueType(), Vector, ExtIdx);
24067+
// FIXME: Using a fixed-size vector for the insertion should not be
24068+
// necessary, but SVE ISEL is missing some folds to avoid fmovs.
24069+
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
24070+
EVT InsertVectorVT = EVT::getVectorVT(
24071+
*DAG.getContext(), ElemVT,
24072+
VectorVT.getVectorElementCount().getKnownMinValue(), false);
24073+
ExtVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, InsertVectorVT,
24074+
DAG.getUNDEF(InsertVectorVT), Ext, Zero);
2410724075
}
2410824076

24109-
return DAG.getStore(ST->getChain(), DL, Ext, ST->getBasePtr(),
24077+
EVT FPMemVT = MemVT == MVT::i8
24078+
? MVT::aarch64mfp8
24079+
: EVT::getFloatingPointVT(MemVT.getSizeInBits());
24080+
SDValue FPSubreg = DAG.getTargetExtractSubreg(getFPSubregForVT(FPMemVT), DL,
24081+
FPMemVT, ExtVector);
24082+
24083+
return DAG.getStore(ST->getChain(), DL, FPSubreg, ST->getBasePtr(),
2411024084
ST->getMemOperand());
2411124085
}
2411224086

@@ -28861,10 +28835,6 @@ SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
2886128835
auto Pg = getPredicateForFixedLengthVector(DAG, DL, VT);
2886228836
auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
2886328837

28864-
// Can be lowered to a bsub store in ISEL.
28865-
if (VT == MVT::v1i64 && MemVT == MVT::v1i8)
28866-
return SDValue();
28867-
2886828838
if (VT.isFloatingPoint() && Store->isTruncatingStore()) {
2886928839
EVT TruncVT = ContainerVT.changeVectorElementType(
2887028840
Store->getMemoryVT().getVectorElementType());

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4601,11 +4601,11 @@ def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
46014601
def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
46024602
(STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
46034603

4604-
// v1i64 -> bsub truncating stores
4605-
def : Pat<(truncstorevi8 v1i64:$VT, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
4606-
(STURBi (aarch64mfp8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, simm9:$offset)>;
4607-
def : Pat<(truncstorevi8 v1i64:$VT, (am_indexed8 GPR64sp:$Rn, uimm12s4:$offset)),
4608-
(STRBui (aarch64mfp8 (EXTRACT_SUBREG v1i64:$VT, bsub)), GPR64sp:$Rn, uimm12s4:$offset)>;
4604+
// aarch64mfp8 (bsub) stores
4605+
def : Pat<(store aarch64mfp8:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
4606+
(STURBi FPR8:$Rt, GPR64sp:$Rn, simm9:$offset)>;
4607+
def : Pat<(store aarch64mfp8:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s4:$offset)),
4608+
(STRBui FPR8:$Rt, GPR64sp:$Rn, uimm12s4:$offset)>;
46094609

46104610
// Match stores from lane 0 to the appropriate subreg's store.
46114611
multiclass VecStoreULane0Pat<SDPatternOperator StoreOp,

llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -70,10 +70,10 @@ define void @insert_vec_v23i32_uaddlv_from_v8i16(ptr %0) {
7070
; CHECK-NEXT: movi.2d v0, #0000000000000000
7171
; CHECK-NEXT: movi.2d v2, #0000000000000000
7272
; CHECK-NEXT: str wzr, [x0, #88]
73-
; CHECK-NEXT: str xzr, [x0, #80]
7473
; CHECK-NEXT: uaddlv.8h s1, v0
7574
; CHECK-NEXT: stp q0, q0, [x0, #16]
7675
; CHECK-NEXT: stp q0, q0, [x0, #48]
76+
; CHECK-NEXT: str d0, [x0, #80]
7777
; CHECK-NEXT: mov.s v2[0], v1[0]
7878
; CHECK-NEXT: ucvtf.4s v1, v2
7979
; CHECK-NEXT: str q1, [x0]
@@ -146,12 +146,13 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
146146
; CHECK-LABEL: insert_vec_v6i64_uaddlv_from_v4i32:
147147
; CHECK: ; %bb.0: ; %entry
148148
; CHECK-NEXT: movi.2d v0, #0000000000000000
149-
; CHECK-NEXT: str xzr, [x0, #16]
149+
; CHECK-NEXT: movi.2d v2, #0000000000000000
150150
; CHECK-NEXT: uaddlv.4s d1, v0
151+
; CHECK-NEXT: str d0, [x0, #16]
151152
; CHECK-NEXT: fmov x8, d1
152153
; CHECK-NEXT: ucvtf s1, x8
153-
; CHECK-NEXT: mov.s v0[0], v1[0]
154-
; CHECK-NEXT: str q0, [x0]
154+
; CHECK-NEXT: mov.s v2[0], v1[0]
155+
; CHECK-NEXT: str q2, [x0]
155156
; CHECK-NEXT: ret
156157

157158
entry:

0 commit comments

Comments (0)