-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[ARM] Speedups for CombineBaseUpdate. #129725
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -149,6 +149,11 @@ MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, | |
cl::desc("Maximum interleave factor for MVE VLDn to generate."), | ||
cl::init(2)); | ||
|
||
cl::opt<unsigned> ArmMaxBaseUpdatesToCheck( | ||
"arm-max-base-updates-to-check", cl::Hidden, | ||
cl::desc("Maximum number of base-updates to check generating postindex."), | ||
cl::init(64)); | ||
|
||
/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV). | ||
constexpr MVT FlagsVT = MVT::i32; | ||
|
||
|
@@ -15865,6 +15870,22 @@ struct BaseUpdateUser { | |
unsigned ConstInc; | ||
}; | ||
|
||
static bool isValidBaseUpdate(SDNode *N, SDNode *User) { | ||
// Check that the add is independent of the load/store. | ||
// Otherwise, folding it would create a cycle. Search through Addr | ||
// as well, since the User may not be a direct user of Addr and | ||
// only share a base pointer. | ||
SmallPtrSet<const SDNode *, 32> Visited; | ||
SmallVector<const SDNode *, 16> Worklist; | ||
Worklist.push_back(N); | ||
Worklist.push_back(User); | ||
const unsigned MaxSteps = 1024; | ||
if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) || | ||
SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps)) | ||
return false; | ||
return true; | ||
} | ||
|
||
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, | ||
struct BaseUpdateUser &User, | ||
bool SimpleConstIncOnly, | ||
|
@@ -16066,6 +16087,9 @@ static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, | |
if (SimpleConstIncOnly && User.ConstInc != NumBytes) | ||
return false; | ||
|
||
if (!isValidBaseUpdate(N, User.N)) | ||
return false; | ||
|
||
// OK, we found an ADD we can fold into the base update. | ||
// Now, create a _UPD node, taking care of not breaking alignment. | ||
|
||
|
@@ -16214,21 +16238,6 @@ static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc) { | |
} | ||
} | ||
|
||
static bool isValidBaseUpdate(SDNode *N, SDNode *User) { | ||
// Check that the add is independent of the load/store. | ||
// Otherwise, folding it would create a cycle. Search through Addr | ||
// as well, since the User may not be a direct user of Addr and | ||
// only share a base pointer. | ||
SmallPtrSet<const SDNode *, 32> Visited; | ||
SmallVector<const SDNode *, 16> Worklist; | ||
Worklist.push_back(N); | ||
Worklist.push_back(User); | ||
if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || | ||
SDNode::hasPredecessorHelper(User, Visited, Worklist)) | ||
return false; | ||
return true; | ||
} | ||
|
||
/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, | ||
/// NEON load/store intrinsics, and generic vector load/stores, to merge | ||
/// base address updates. | ||
|
@@ -16242,6 +16251,10 @@ static SDValue CombineBaseUpdate(SDNode *N, | |
const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1); | ||
BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx}; | ||
|
||
// Limit the number of possible base-updates we look at to prevent degenerate | ||
// cases. | ||
unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck; | ||
|
||
SDValue Addr = N->getOperand(AddrOpIdx); | ||
|
||
SmallVector<BaseUpdateUser, 8> BaseUpdates; | ||
|
@@ -16256,8 +16269,11 @@ static SDValue CombineBaseUpdate(SDNode *N, | |
unsigned ConstInc = | ||
getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG); | ||
|
||
if (ConstInc || User->getOpcode() == ISD::ADD) | ||
if (ConstInc || User->getOpcode() == ISD::ADD) { | ||
BaseUpdates.push_back({User, Inc, ConstInc}); | ||
if (BaseUpdates.size() >= MaxBaseUpdates) | ||
break; | ||
} | ||
} | ||
|
||
// If the address is a constant pointer increment itself, find | ||
|
@@ -16284,27 +16300,19 @@ static SDValue CombineBaseUpdate(SDNode *N, | |
unsigned NewConstInc = UserOffset - Offset; | ||
SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32); | ||
BaseUpdates.push_back({User, NewInc, NewConstInc}); | ||
if (BaseUpdates.size() >= MaxBaseUpdates) | ||
break; | ||
} | ||
} | ||
|
||
// Try to fold the load/store with an update that matches memory | ||
// access size. This should work well for sequential loads. | ||
// | ||
// Filter out invalid updates as well. | ||
unsigned NumValidUpd = BaseUpdates.size(); | ||
for (unsigned I = 0; I < NumValidUpd;) { | ||
for (unsigned I = 0; I < NumValidUpd; I++) { | ||
BaseUpdateUser &User = BaseUpdates[I]; | ||
if (!isValidBaseUpdate(N, User.N)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good idea. The downside is that there are two calls to TryCombineBaseUpdate, so we potentially call isValidBaseUpdate twice. This only happens if we can convert an instruction to post increment, but there is a dependency on operands that we cannot break. Shouldn't be a common case, and with MaxSteps parameter being set, it is not as bad for compile time. |
||
--NumValidUpd; | ||
std::swap(BaseUpdates[I], BaseUpdates[NumValidUpd]); | ||
continue; | ||
} | ||
|
||
if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI)) | ||
return SDValue(); | ||
++I; | ||
} | ||
BaseUpdates.resize(NumValidUpd); | ||
|
||
// Try to fold with other users. Non-constant updates are considered | ||
// first, and constant updates are sorted to not break a sequence of | ||
|
@@ -16360,8 +16368,9 @@ static SDValue PerformMVEVLDCombine(SDNode *N, | |
Visited.insert(Addr.getNode()); | ||
Worklist.push_back(N); | ||
Worklist.push_back(User); | ||
if (SDNode::hasPredecessorHelper(N, Visited, Worklist) || | ||
SDNode::hasPredecessorHelper(User, Visited, Worklist)) | ||
const unsigned MaxSteps = 1024; | ||
if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) || | ||
SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps)) | ||
continue; | ||
|
||
// Find the new opcode for the updating load/store. | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Other than acting as a place to put the comment, couldn't we just use ArmMaxBaseUpdatesToCheck inline in the two places it is used?
Don't have a strong opinion on that, happy to keep this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I added this mostly as a place to put the comment. It could also help to not dereference the global multiple times, but that likely isn't a huge issue.