Skip to content

Commit 86cf4ed

Browse files
authored
[ARM] Speedups for CombineBaseUpdate. (#129725)
This attempts to put limits onto CombineBaseUpdate for degenerate cases like #127477. The biggest change is to add a limit to the number of base updates to check in CombineBaseUpdate. 64 is hopefully plenty high enough for most runtime unrolled loops to generate postinc where they are beneficial. It also moves the check for isValidBaseUpdate later so that it only happens if we will generate a valid instruction. The 1024 limit to hasPredecessorHelper comes from the X86 backend, which uses the same limit. I haven't added a test case as it would need to be very big and my attempts at generating a smaller version did not show anything useful. Fixes #127477.
1 parent b941d90 commit 86cf4ed

File tree

1 file changed

+38
-29
lines changed

1 file changed

+38
-29
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 38 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,11 @@ MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
149149
cl::desc("Maximum interleave factor for MVE VLDn to generate."),
150150
cl::init(2));
151151

152+
cl::opt<unsigned> ArmMaxBaseUpdatesToCheck(
153+
"arm-max-base-updates-to-check", cl::Hidden,
154+
cl::desc("Maximum number of base-updates to check generating postindex."),
155+
cl::init(64));
156+
152157
/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
153158
constexpr MVT FlagsVT = MVT::i32;
154159

@@ -15865,6 +15870,22 @@ struct BaseUpdateUser {
1586515870
unsigned ConstInc;
1586615871
};
1586715872

15873+
static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
15874+
// Check that the add is independent of the load/store.
15875+
// Otherwise, folding it would create a cycle. Search through Addr
15876+
// as well, since the User may not be a direct user of Addr and
15877+
// only share a base pointer.
15878+
SmallPtrSet<const SDNode *, 32> Visited;
15879+
SmallVector<const SDNode *, 16> Worklist;
15880+
Worklist.push_back(N);
15881+
Worklist.push_back(User);
15882+
const unsigned MaxSteps = 1024;
15883+
if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15884+
SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15885+
return false;
15886+
return true;
15887+
}
15888+
1586815889
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
1586915890
struct BaseUpdateUser &User,
1587015891
bool SimpleConstIncOnly,
@@ -16066,6 +16087,9 @@ static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target,
1606616087
if (SimpleConstIncOnly && User.ConstInc != NumBytes)
1606716088
return false;
1606816089

16090+
if (!isValidBaseUpdate(N, User.N))
16091+
return false;
16092+
1606916093
// OK, we found an ADD we can fold into the base update.
1607016094
// Now, create a _UPD node, taking care of not breaking alignment.
1607116095

@@ -16214,21 +16238,6 @@ static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) {
1621416238
}
1621516239
}
1621616240

16217-
static bool isValidBaseUpdate(SDNode *N, SDNode *User) {
16218-
// Check that the add is independent of the load/store.
16219-
// Otherwise, folding it would create a cycle. Search through Addr
16220-
// as well, since the User may not be a direct user of Addr and
16221-
// only share a base pointer.
16222-
SmallPtrSet<const SDNode *, 32> Visited;
16223-
SmallVector<const SDNode *, 16> Worklist;
16224-
Worklist.push_back(N);
16225-
Worklist.push_back(User);
16226-
if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
16227-
SDNode::hasPredecessorHelper(User, Visited, Worklist))
16228-
return false;
16229-
return true;
16230-
}
16231-
1623216241
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
1623316242
/// NEON load/store intrinsics, and generic vector load/stores, to merge
1623416243
/// base address updates.
@@ -16242,6 +16251,10 @@ static SDValue CombineBaseUpdate(SDNode *N,
1624216251
const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
1624316252
BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
1624416253

16254+
// Limit the number of possible base-updates we look at to prevent degenerate
16255+
// cases.
16256+
unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
16257+
1624516258
SDValue Addr = N->getOperand(AddrOpIdx);
1624616259

1624716260
SmallVector<BaseUpdateUser, 8> BaseUpdates;
@@ -16256,8 +16269,11 @@ static SDValue CombineBaseUpdate(SDNode *N,
1625616269
unsigned ConstInc =
1625716270
getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
1625816271

16259-
if (ConstInc || User->getOpcode() == ISD::ADD)
16272+
if (ConstInc || User->getOpcode() == ISD::ADD) {
1626016273
BaseUpdates.push_back({User, Inc, ConstInc});
16274+
if (BaseUpdates.size() >= MaxBaseUpdates)
16275+
break;
16276+
}
1626116277
}
1626216278

1626316279
// If the address is a constant pointer increment itself, find
@@ -16284,27 +16300,19 @@ static SDValue CombineBaseUpdate(SDNode *N,
1628416300
unsigned NewConstInc = UserOffset - Offset;
1628516301
SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
1628616302
BaseUpdates.push_back({User, NewInc, NewConstInc});
16303+
if (BaseUpdates.size() >= MaxBaseUpdates)
16304+
break;
1628716305
}
1628816306
}
1628916307

1629016308
// Try to fold the load/store with an update that matches memory
1629116309
// access size. This should work well for sequential loads.
16292-
//
16293-
// Filter out invalid updates as well.
1629416310
unsigned NumValidUpd = BaseUpdates.size();
16295-
for (unsigned I = 0; I < NumValidUpd;) {
16311+
for (unsigned I = 0; I < NumValidUpd; I++) {
1629616312
BaseUpdateUser &User = BaseUpdates[I];
16297-
if (!isValidBaseUpdate(N, User.N)) {
16298-
--NumValidUpd;
16299-
std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]);
16300-
continue;
16301-
}
16302-
1630316313
if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
1630416314
return SDValue();
16305-
++I;
1630616315
}
16307-
BaseUpdates.resize(NumValidUpd);
1630816316

1630916317
// Try to fold with other users. Non-constant updates are considered
1631016318
// first, and constant updates are sorted to not break a sequence of
@@ -16360,8 +16368,9 @@ static SDValue PerformMVEVLDCombine(SDNode *N,
1636016368
Visited.insert(Addr.getNode());
1636116369
Worklist.push_back(N);
1636216370
Worklist.push_back(User);
16363-
if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
16364-
SDNode::hasPredecessorHelper(User, Visited, Worklist))
16371+
const unsigned MaxSteps = 1024;
16372+
if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16373+
SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
1636516374
continue;
1636616375

1636716376
// Find the new opcode for the updating load/store.

0 commit comments

Comments
 (0)