Skip to content

Commit a7dafea

Browse files
authored
[SDAG] Allow folding stack slots into sincos/frexp in more cases (llvm#118117)
This adds a new helper `canFoldStoreIntoLibCallOutputPointers()` to check that it is safe to fold a store into a node that will expand to a library call that takes output pointers. This requires checking for two (independent) properties: 1. The store is not within a CALLSEQ_START..CALLSEQ_END pair * If it is, the expansion would lead to nested call sequences (which is invalid) 2. The node does not appear as a predecessor to the store * If it does, attempting to merge the store into the call would result in a cycle in the DAG These two properties are checked as part of the same traversal in `canFoldStoreIntoLibCallOutputPointers()`
1 parent f7988a3 commit a7dafea

File tree

9 files changed

+381
-185
lines changed

9 files changed

+381
-185
lines changed

llvm/include/llvm/CodeGen/SelectionDAG.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -455,6 +455,9 @@ class SelectionDAG {
455455
// Maximum depth for recursive analysis such as computeKnownBits, etc.
456456
static constexpr unsigned MaxRecursionDepth = 6;
457457

458+
// Returns the maximum steps for SDNode->hasPredecessor() like searches.
459+
static unsigned getHasPredecessorMaxSteps();
460+
458461
explicit SelectionDAG(const TargetMachine &TM, CodeGenOptLevel);
459462
SelectionDAG(const SelectionDAG &) = delete;
460463
SelectionDAG &operator=(const SelectionDAG &) = delete;

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -153,12 +153,6 @@ static cl::opt<bool> EnableVectorFCopySignExtendRound(
153153
"combiner-vector-fcopysign-extend-round", cl::Hidden, cl::init(false),
154154
cl::desc(
155155
"Enable merging extends and rounds into FCOPYSIGN on vector types"));
156-
157-
static cl::opt<unsigned int>
158-
MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192),
159-
cl::desc("DAG combiner limit number of steps when searching DAG "
160-
"for predecessor nodes"));
161-
162156
namespace {
163157

164158
class DAGCombiner {
@@ -18929,6 +18923,7 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
1892918923
// can be folded with this one. We should do this to avoid having to keep
1893018924
// a copy of the original base pointer.
1893118925
SmallVector<SDNode *, 16> OtherUses;
18926+
unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
1893218927
if (isa<ConstantSDNode>(Offset))
1893318928
for (SDNode::use_iterator UI = BasePtr->use_begin(),
1893418929
UE = BasePtr->use_end();
@@ -19093,6 +19088,7 @@ static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse,
1909319088
return false;
1909419089

1909519090
SmallPtrSet<const SDNode *, 32> Visited;
19091+
unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
1909619092
for (SDNode *Use : BasePtr->uses()) {
1909719093
if (Use == Ptr.getNode())
1909819094
continue;
@@ -19139,6 +19135,7 @@ static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad,
1913919135
// 2) Op must be independent of N, i.e. Op is neither a predecessor
1914019136
// nor a successor of N. Otherwise, if Op is folded that would
1914119137
// create a cycle.
19138+
unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
1914219139
for (SDNode *Op : Ptr->uses()) {
1914319140
// Check for #1.
1914419141
if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI))

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 72 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -111,10 +111,17 @@ static cl::opt<int> MaxLdStGlue("ldstmemcpy-glue-max",
111111
cl::desc("Number limit for gluing ld/st of memcpy."),
112112
cl::Hidden, cl::init(0));
113113

114+
static cl::opt<unsigned>
115+
MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192),
116+
cl::desc("DAG combiner limit number of steps when searching DAG "
117+
"for predecessor nodes"));
118+
114119
static void NewSDValueDbgMsg(SDValue V, StringRef Msg, SelectionDAG *G) {
115120
LLVM_DEBUG(dbgs() << Msg; V.getNode()->dump(G););
116121
}
117122

123+
unsigned SelectionDAG::getHasPredecessorMaxSteps() { return MaxSteps; }
124+
118125
//===----------------------------------------------------------------------===//
119126
// ConstantFPSDNode Class
120127
//===----------------------------------------------------------------------===//
@@ -2474,6 +2481,51 @@ SDValue SelectionDAG::getPartialReduceAdd(SDLoc DL, EVT ReducedTy, SDValue Op1,
24742481
return Subvectors[0];
24752482
}
24762483

2484+
/// Given a store node \p StoreNode, return true if it is safe to fold that node
2485+
/// into \p FPNode, which expands to a library call with output pointers.
2486+
static bool canFoldStoreIntoLibCallOutputPointers(StoreSDNode *StoreNode,
2487+
SDNode *FPNode) {
2488+
SmallVector<const SDNode *, 8> Worklist;
2489+
SmallVector<const SDNode *, 8> DeferredNodes;
2490+
SmallPtrSet<const SDNode *, 16> Visited;
2491+
2492+
// Skip FPNode use by StoreNode (that's the use we want to fold into FPNode).
2493+
for (SDValue Op : StoreNode->ops())
2494+
if (Op.getNode() != FPNode)
2495+
Worklist.push_back(Op.getNode());
2496+
2497+
unsigned MaxSteps = SelectionDAG::getHasPredecessorMaxSteps();
2498+
while (!Worklist.empty()) {
2499+
const SDNode *Node = Worklist.pop_back_val();
2500+
auto [_, Inserted] = Visited.insert(Node);
2501+
if (!Inserted)
2502+
continue;
2503+
2504+
if (MaxSteps > 0 && Visited.size() >= MaxSteps)
2505+
return false;
2506+
2507+
// Reached the FPNode (would result in a cycle).
2508+
// OR Reached CALLSEQ_START (would result in nested call sequences).
2509+
if (Node == FPNode || Node->getOpcode() == ISD::CALLSEQ_START)
2510+
return false;
2511+
2512+
if (Node->getOpcode() == ISD::CALLSEQ_END) {
2513+
// Defer looking into call sequences (so we can check we're outside one).
2514+
// We still need to look through these for the predecessor check.
2515+
DeferredNodes.push_back(Node);
2516+
continue;
2517+
}
2518+
2519+
for (SDValue Op : Node->ops())
2520+
Worklist.push_back(Op.getNode());
2521+
}
2522+
2523+
// True if we're outside a call sequence and don't have the FPNode as a
2524+
// predecessor. No cycles or nested call sequences possible.
2525+
return !SDNode::hasPredecessorHelper(FPNode, Visited, DeferredNodes,
2526+
MaxSteps);
2527+
}
2528+
24772529
bool SelectionDAG::expandMultipleResultFPLibCall(
24782530
RTLIB::Libcall LC, SDNode *Node, SmallVectorImpl<SDValue> &Results,
24792531
std::optional<unsigned> CallRetResNo) {
@@ -2502,26 +2554,35 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
25022554

25032555
// Find users of the node that store the results (and share input chains). The
25042556
// destination pointers can be used instead of creating stack allocations.
2505-
// FIXME: This should allow stores with the same chains (not just the entry
2506-
// chain), but there's a risk the store is within a (CALLSEQ_START,
2507-
// CALLSEQ_END) pair, which after this expansion will lead to nested call
2508-
// sequences.
2509-
SDValue InChain = getEntryNode();
2557+
SDValue StoresInChain;
25102558
SmallVector<StoreSDNode *, 2> ResultStores(NumResults);
25112559
for (SDNode *User : Node->uses()) {
25122560
if (!ISD::isNormalStore(User))
25132561
continue;
25142562
auto *ST = cast<StoreSDNode>(User);
25152563
SDValue StoreValue = ST->getValue();
25162564
unsigned ResNo = StoreValue.getResNo();
2565+
// Ensure the store corresponds to an output pointer.
2566+
if (CallRetResNo == ResNo)
2567+
continue;
2568+
// Ensure the store to the default address space and not atomic or volatile.
2569+
if (!ST->isSimple() || ST->getAddressSpace() != 0)
2570+
continue;
2571+
// Ensure all store chains are the same (so they don't alias).
2572+
if (StoresInChain && ST->getChain() != StoresInChain)
2573+
continue;
2574+
// Ensure the store is properly aligned.
25172575
Type *StoreType = StoreValue.getValueType().getTypeForEVT(Ctx);
2518-
if (CallRetResNo == ResNo || !ST->isSimple() ||
2519-
ST->getAddressSpace() != 0 ||
2520-
ST->getAlign() <
2521-
getDataLayout().getABITypeAlign(StoreType->getScalarType()) ||
2522-
ST->getChain() != InChain)
2576+
if (ST->getAlign() <
2577+
getDataLayout().getABITypeAlign(StoreType->getScalarType()))
2578+
continue;
2579+
// Avoid:
2580+
// 1. Creating cyclic dependencies.
2581+
// 2. Expanding the node to a call within a call sequence.
2582+
if (!canFoldStoreIntoLibCallOutputPointers(ST, Node))
25232583
continue;
25242584
ResultStores[ResNo] = ST;
2585+
StoresInChain = ST->getChain();
25252586
}
25262587

25272588
TargetLowering::ArgListTy Args;
@@ -2563,6 +2624,7 @@ bool SelectionDAG::expandMultipleResultFPLibCall(
25632624
Type *RetType = CallRetResNo.has_value()
25642625
? Node->getValueType(*CallRetResNo).getTypeForEVT(Ctx)
25652626
: Type::getVoidTy(Ctx);
2627+
SDValue InChain = StoresInChain ? StoresInChain : getEntryNode();
25662628
SDValue Callee = getExternalSymbol(VD ? VD->getVectorFnName().data() : LCName,
25672629
TLI->getPointerTy(getDataLayout()));
25682630
TargetLowering::CallLoweringInfo CLI(*this);

llvm/test/CodeGen/AArch64/sincos-stack-slots.ll

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,3 +253,37 @@ entry:
253253
store double %cos, ptr %out_cos, align 4
254254
ret void
255255
}
256+
257+
declare void @foo(ptr, ptr)
258+
259+
define void @can_fold_with_call_in_chain(float %x, ptr noalias %a, ptr noalias %b) {
260+
; CHECK-LABEL: can_fold_with_call_in_chain:
261+
; CHECK: // %bb.0: // %entry
262+
; CHECK-NEXT: str d8, [sp, #-32]! // 8-byte Folded Spill
263+
; CHECK-NEXT: str x30, [sp, #8] // 8-byte Folded Spill
264+
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
265+
; CHECK-NEXT: .cfi_def_cfa_offset 32
266+
; CHECK-NEXT: .cfi_offset w19, -8
267+
; CHECK-NEXT: .cfi_offset w20, -16
268+
; CHECK-NEXT: .cfi_offset w30, -24
269+
; CHECK-NEXT: .cfi_offset b8, -32
270+
; CHECK-NEXT: mov x19, x1
271+
; CHECK-NEXT: mov x20, x0
272+
; CHECK-NEXT: fmov s8, s0
273+
; CHECK-NEXT: bl foo
274+
; CHECK-NEXT: fmov s0, s8
275+
; CHECK-NEXT: mov x0, x20
276+
; CHECK-NEXT: mov x1, x19
277+
; CHECK-NEXT: bl sincosf
278+
; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
279+
; CHECK-NEXT: ldr x30, [sp, #8] // 8-byte Folded Reload
280+
; CHECK-NEXT: ldr d8, [sp], #32 // 8-byte Folded Reload
281+
; CHECK-NEXT: ret
282+
entry:
283+
%sin = tail call float @llvm.sin.f32(float %x)
284+
%cos = tail call float @llvm.cos.f32(float %x)
285+
call void @foo(ptr %a, ptr %b)
286+
store float %sin, ptr %a, align 4
287+
store float %cos, ptr %b, align 4
288+
ret void
289+
}

llvm/test/CodeGen/PowerPC/f128-arith.ll

Lines changed: 10 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1365,45 +1365,33 @@ define dso_local fp128 @qpFREXP(ptr %a, ptr %b) {
13651365
; CHECK-LABEL: qpFREXP:
13661366
; CHECK: # %bb.0: # %entry
13671367
; CHECK-NEXT: mflr r0
1368-
; CHECK-NEXT: .cfi_def_cfa_offset 64
1368+
; CHECK-NEXT: stdu r1, -32(r1)
1369+
; CHECK-NEXT: std r0, 48(r1)
1370+
; CHECK-NEXT: .cfi_def_cfa_offset 32
13691371
; CHECK-NEXT: .cfi_offset lr, 16
1370-
; CHECK-NEXT: .cfi_offset r30, -16
1371-
; CHECK-NEXT: std r30, -16(r1) # 8-byte Folded Spill
1372-
; CHECK-NEXT: stdu r1, -64(r1)
1373-
; CHECK-NEXT: std r0, 80(r1)
1374-
; CHECK-NEXT: addi r5, r1, 44
1375-
; CHECK-NEXT: mr r30, r4
13761372
; CHECK-NEXT: lxv v2, 0(r3)
1373+
; CHECK-NEXT: mr r5, r4
13771374
; CHECK-NEXT: bl frexpf128
13781375
; CHECK-NEXT: nop
1379-
; CHECK-NEXT: lwz r3, 44(r1)
1380-
; CHECK-NEXT: stw r3, 0(r30)
1381-
; CHECK-NEXT: addi r1, r1, 64
1376+
; CHECK-NEXT: addi r1, r1, 32
13821377
; CHECK-NEXT: ld r0, 16(r1)
1383-
; CHECK-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
13841378
; CHECK-NEXT: mtlr r0
13851379
; CHECK-NEXT: blr
13861380
;
13871381
; CHECK-P8-LABEL: qpFREXP:
13881382
; CHECK-P8: # %bb.0: # %entry
13891383
; CHECK-P8-NEXT: mflr r0
1390-
; CHECK-P8-NEXT: .cfi_def_cfa_offset 64
1384+
; CHECK-P8-NEXT: stdu r1, -32(r1)
1385+
; CHECK-P8-NEXT: std r0, 48(r1)
1386+
; CHECK-P8-NEXT: .cfi_def_cfa_offset 32
13911387
; CHECK-P8-NEXT: .cfi_offset lr, 16
1392-
; CHECK-P8-NEXT: .cfi_offset r30, -16
1393-
; CHECK-P8-NEXT: std r30, -16(r1) # 8-byte Folded Spill
1394-
; CHECK-P8-NEXT: stdu r1, -64(r1)
1395-
; CHECK-P8-NEXT: std r0, 80(r1)
1396-
; CHECK-P8-NEXT: addi r5, r1, 44
1397-
; CHECK-P8-NEXT: mr r30, r4
13981388
; CHECK-P8-NEXT: lxvd2x vs0, 0, r3
1389+
; CHECK-P8-NEXT: mr r5, r4
13991390
; CHECK-P8-NEXT: xxswapd v2, vs0
14001391
; CHECK-P8-NEXT: bl frexpf128
14011392
; CHECK-P8-NEXT: nop
1402-
; CHECK-P8-NEXT: lwz r3, 44(r1)
1403-
; CHECK-P8-NEXT: stw r3, 0(r30)
1404-
; CHECK-P8-NEXT: addi r1, r1, 64
1393+
; CHECK-P8-NEXT: addi r1, r1, 32
14051394
; CHECK-P8-NEXT: ld r0, 16(r1)
1406-
; CHECK-P8-NEXT: ld r30, -16(r1) # 8-byte Folded Reload
14071395
; CHECK-P8-NEXT: mtlr r0
14081396
; CHECK-P8-NEXT: blr
14091397
entry:

0 commit comments

Comments
 (0)