Skip to content

SESE: deal with the case where outer loops are moved into the current loop. #20222

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Nov 2, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
161 changes: 126 additions & 35 deletions lib/SILOptimizer/Mandatory/TFCanonicalizeCFG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -420,11 +420,13 @@ class BasicBlockCloner : public SILClonerWithScopes<BasicBlockCloner> {
"times during SESE cloning.");
}

/// Clone the body of `loop` starting from `startBlock` and nest the cloned
/// fragment into the parent loop. If `startBlock` is the same as the header
/// of `loop`, we clone the entire loop including the back edge. Otherwise,
/// we clone one iteration of the loop body without the back edge.
SILLoop *cloneLoop(SILLoopInfo *LI, SILLoop *loop, SILBasicBlock *startBlock) {
/// Utility to unroll one iteration of the loop or to clone the entire loop.
/// - If `startBlock` is the same as the header of `loop`, we clone the
/// entire loop including the back edge.
/// - Otherwise, we unroll one iteration of the loop body starting from
/// `startBlock' to the latch.
/// The unrolled or cloned version is nested into the parent loop.
SILLoop *cloneOrUnrollLoop(SILLoopInfo *LI, SILLoop *loop, SILBasicBlock *startBlock) {
llvm::DenseMap<SILLoop*, SILLoop*> loopClones;
// This is for convenience as top-level loops have nullptr for parent loop.
loopClones[nullptr] = nullptr;
Expand Down Expand Up @@ -514,24 +516,48 @@ class BasicBlockCloner : public SILClonerWithScopes<BasicBlockCloner> {

// A helper class to transform a loop to have a single exit from the header.
class SingleExitLoopTransformer {
public:
SingleExitLoopTransformer(GraphFunctionDeviceInfo *deviceInfo,
SILLoopInfo *LI, DominanceInfo *DI, SILLoop *loop,
PostDominanceInfo *PDI)
: deviceInfo(deviceInfo), DI(DI), PDI(PDI), LI(LI), loop(loop),
header(loop->getHeader()), preheader(loop->getLoopPreheader()),
latch(loop->getLoopLatch()), currentFn(header->getParent()),
oldHeaderNumArgs(header->getNumArguments()), hasUndefsAtPreheader(false) {
assert(preheader && "Canonicalization should have given us one preheader");
assert(latch && "Canonicalization should have given us one latch block");
initialize();
public:
// Convert the given loop into a SESE form. Returns false if the loop was
// already in SESE form. Otherwise, returns true.
static bool doIt(GraphFunctionDeviceInfo *deviceInfo, SILLoopInfo *LI,
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

document the return value

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

DominanceInfo *DI, SILLoop *loop, PostDominanceInfo *PDI) {
SingleExitLoopTransformer transformer(deviceInfo, LI, DI, loop, PDI);
bool loopChanged = transformer.transform();
if (loopChanged) {
// Recalculate dominator information as it is stale now.
DI->recalculate(*transformer.currentFn);
PDI->recalculate(*transformer.currentFn);
}

#ifndef NDEBUG
{
// Verify that the loop is OK after all the transformations.
llvm::DenseSet<const SILLoop *> nestedLoops;
loop->verifyLoopNest(&nestedLoops);
}
#endif
return loopChanged;
}

private:
SingleExitLoopTransformer(GraphFunctionDeviceInfo *deviceInfo,
SILLoopInfo *LI, DominanceInfo *DI, SILLoop *loop,
PostDominanceInfo *PDI)
: deviceInfo(deviceInfo), DI(DI), PDI(PDI), LI(LI), loop(loop),
header(loop->getHeader()), preheader(loop->getLoopPreheader()),
latch(loop->getLoopLatch()), currentFn(header->getParent()),
oldHeaderNumArgs(header->getNumArguments()),
hasUndefsAtPreheader(false) {
assert(preheader &&
"Canonicalization should have given us one preheader");
assert(latch && "Canonicalization should have given us one latch block");
initialize();
}

/// Transforms the loop to ensure it has a single exit from the header.
/// Returns true if the CFG was changed.
bool transform();

private:
// Helper functions

void initialize();
Expand Down Expand Up @@ -711,6 +737,24 @@ void SingleExitLoopTransformer::ensureSingleExitBlock() {
<< SILPrintContext(llvm::dbgs()).getID(nearestCommonPD)
<< "\n");

// Compute the set of preheaders of loops that are unrelated to our loop w.r.t
// nesting. This will be used when needing to identify cases where a loop
// from outside is moved into the current loop. e.g.,
// while ... {
// if ... {
// for(...) {...} // This should be nested into the while loop.
// break;
// }
// }
// The unrelated loops are those that are not contained within each other.
SmallPtrSet<SILBasicBlock *, 32> unrelatedPreheaders;
for (auto *otherLoop : *LI) {
if (!otherLoop->contains(loop) && !loop->contains(otherLoop)) {
unrelatedPreheaders.insert(otherLoop->getLoopPreheader());
}
}


// Collect all the blocks from each exiting block up to nearest common PD.
SmallPtrSet<SILBasicBlock *, 32> blocksToBeMoved;
for (SILBasicBlock *exitBlock : exitBlockList) {
Expand All @@ -735,6 +779,19 @@ void SingleExitLoopTransformer::ensureSingleExitBlock() {
continue;
}

// Check if `succ` is a preheader of another loop.
SILLoop *succBlockLoop = nullptr;
if (unrelatedPreheaders.count(succ) > 0) {
// We are about a move a loop from outside. Perform canonicalization
// of that loop first.
SILBasicBlock *unrelatedHeader = succ->getSingleSuccessorBlock();
assert(unrelatedHeader &&
"There should be a single successor for a preheader.");
succBlockLoop = LI->getLoopFor(unrelatedHeader);
SingleExitLoopTransformer::doIt(deviceInfo, LI, DI, succBlockLoop,
PDI);
}

if (DI->properlyDominates(header, succ)) {
worklist.insert(succ);
continue;
Expand All @@ -752,6 +809,24 @@ void SingleExitLoopTransformer::ensureSingleExitBlock() {

// Clone the block and rewire the edge.
SILBasicBlock *clonedSucc = cloner.initAndCloneBlock(succ);
// If `succ` is a preheader of an unrelated loop, we will have to clone
// the entire loop now so that we can also incrementally update LoopInfo.
if (succBlockLoop) {
SILLoop *clonedLoop = cloner.cloneOrUnrollLoop(
LI, succBlockLoop, succBlockLoop->getHeader());
changeBranchTarget(clonedSucc->getTerminator(), 0,
clonedLoop->getHeader(), /*preserveArgs*/ true);
// Note that all the nodes of `clonedLoop` should be moved into the
// current loop. We do that here itself as an optimization and also
// because the dominator and post-dominator information for the new
// blocks in `clonedLoop` are stale and cannot be relied upon.
for (SILBasicBlock *bb : clonedLoop->getBlocks()) {
blocksToBeMoved.insert(bb);
}
// Add the header to worklist for processing the exit edge.
// (Other successor edges are already processed above.)
worklist.insert(clonedLoop->getHeader());
}
changeBranchTarget(current->getTerminator(), edgeIdx, clonedSucc,
/*preserveArgs*/ true);
worklist.insert(clonedSucc);
Expand All @@ -771,24 +846,45 @@ void SingleExitLoopTransformer::ensureSingleExitBlock() {

// Update loop info if this belongs to a parent loop.
SILLoop *outsideBlockLoop = LI->getLoopFor(outsideBlock);
if (outsideBlockLoop != nullptr) {
// FIXME: We don't deal with cases where the nodes being moved in
// belong to another loop yet. e.g.,
if (outsideBlockLoop == nullptr) {
// outsideBlock is not part of any other loop. Simply add it to our loop.
loop->addBasicBlockToLoop(outsideBlock, LI->getBase());
} else {
// We deal with the case where the nodes being moved in
// belong to another loop. e.g.,
// while ... {
// if ... {
// for(...) {...}
// break;
// }
// }
// Check that `loop` is nested within `reachableLoop`.
assert(outsideBlockLoop->contains(loop) &&
"Nodes being moved belong to a non-nested loop.");
// Move the node into our loop.
outsideBlockLoop->removeBlockFromLoop(outsideBlock);
LI->changeLoopFor(outsideBlock, nullptr);
if (outsideBlockLoop->contains(loop)) {
// If our `loop` is nested within `outsideBlockLoop`. Move the node
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there test coverage on both this if branch, and the else below?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we have coverage for both. The if branch was handled even before this PR (and is covered by the example with labeled breaks).

The else branch is the new behavior and the tests in this PR provide coverage.

// from `outsideBlockLoop` into our `loop`.
outsideBlockLoop->removeBlockFromLoop(outsideBlock);
LI->changeLoopFor(outsideBlock, nullptr);
loop->addBasicBlockToLoop(outsideBlock, LI->getBase());
} else {
// We should only nest `outsideBlockLoop` into our `loop` when we
// process the very first node of the `outsideBlockLoop`. Check that we
// have not already nested the `outsideBlockLoop` into our `loop`.
if (!loop->contains(outsideBlockLoop)) {
// Not yet nested, adjust the LoopInfo w.r.t nesting.
if (outsideBlockLoop->getParentLoop() == nullptr) {
// Remove from top-level loops as we are nesting it in `loop`.
LI->removeLoop(llvm::find(*LI, outsideBlockLoop));
}
loop->addChildLoop(outsideBlockLoop);
}
// Add the block to this loop and all its parents.
auto *L = loop;
while (L) {
L->addBlockEntry(outsideBlock);
L = L->getParentLoop();
}
}
// top-level loop is already correct.
}
loop->addBasicBlockToLoop(outsideBlock, LI->getBase());
}
if (cloner.hasCloned()) {
// TODO(https://bugs.swift.org/browse/SR-8336): the transformations here are
Expand Down Expand Up @@ -1283,13 +1379,8 @@ void SESERegionBuilder::ensureSingleExitFromLoops() {
}
continue;
}
SingleExitLoopTransformer transformer(&deviceInfo, &LI, &DI, loop, &PDI);
bool loopChanged = transformer.transform();
if (loopChanged) {
// Recalculate dominator information as it is stale now.
DI.recalculate(*F);
PDI.recalculate(*F);
}
bool loopChanged =
SingleExitLoopTransformer::doIt(&deviceInfo, &LI, &DI, loop, &PDI);
changed |= loopChanged;
}
if (changed) {
Expand Down Expand Up @@ -1414,7 +1505,7 @@ void SingleExitLoopTransformer::unrollLoopBodyOnce() {
}

// Clone everything starting from the old header.
cloner.cloneLoop(LI, loop, header);
cloner.cloneOrUnrollLoop(LI, loop, header);

// Get the clone for old header.
SILBasicBlock *clonedOldHeader = cloner.remapBasicBlock(header);
Expand Down
117 changes: 107 additions & 10 deletions test/TensorFlow/sese_loop_canonicalization.sil
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
// RUN: %target-sil-opt -tf-xla-cfg-canonicalize -tf-ensure-single-loop-exit -assume-parsing-unqualified-ownership-sil %s -o /dev/null | %FileCheck %s
// RUN: %target-sil-opt -tf-dump-intermediates -tf-xla-cfg-canonicalize -tf-ensure-single-loop-exit -assume-parsing-unqualified-ownership-sil %s -o /dev/null | %FileCheck %s

import Builtin
import Swift
Expand Down Expand Up @@ -294,17 +294,17 @@ bb3 (%9 : $Builtin.Int32):
// }
// The CFG is shown below:
//
// [0]-----+
// [0]-------+
// | \
// [1] [2]
// | |
// | +--- [4]
// | | / \
// | | [6] [5]
// | +/ \ |
// [3]----[8] |
// | |
// +-->[9]<------+
// | |
// | +--->[4]
// | [7] / \
// | | [6] [5]
// V +-+ \ |
// [3]<-----[8] |
// | |
// +-->[9]<-----+
//
// This is an example where the nodes that lead to the common post dominator of all exit nodes
// is also reachable from nodes outside of the loop. The key issue is that bb9 is the common
Expand Down Expand Up @@ -374,3 +374,100 @@ bb8:
bb9(%18 : $Builtin.Int32):
return %18 : $Builtin.Int32
}

// The following example is similar to loopThatRequiresNodeCloning, where the
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

given the swift level test in the next file below, what is the value of having this SIL level test?

I feel it could take non-trivial work to read and maintain such tests.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is representing a case where we need to move an outside loop into the currently processed loop. However, the header of the outside loop is not dominated by the currently processed loop. In this case, we will have to clone the loop and make sure the LoopInfo is updated.

We want a very specific CFG to test this scenario. See the ASCII graph in the test file for the CFG. With a swift file, the generated CFG can change depending upon the version of the compiler. (Also, the compiler did generate a similar CFG for an example.)

// nodes that lead to the common post dominator of all exit nodes is also
// reachable from nodes outside of the loop. The only difference in this example
// is that some of these nodes also belong to a loop. Therefore, it is required
// for us to update the LoopInfo in addition to cloning. The loop in question
// consists of the single node bb3. The CFG is shown below:
//
// [0]-------+
// | \
// [1] [2]
// | |
// | +--->[4]
// | [7] / \
// | | [6] [5]
// V +--+ \ |
// +-->[3]<-----[8] |
// | / | |
// + +-->[9]<-----+
//
// CHECK-LABEL:--- XLA CFG Loops Before Canonicalize: $loopThatRequiresNodeAndLoopCloning
//----The following loop will be cloned in the second loop.
// CHECK:Loop at depth 1 containing: {{bb[0-9]+}}<header><latch><exiting>
// CHECK:Loop at depth 1 containing: {{bb[0-9]+}}<header><exiting>,{{bb[0-9]+}}<exiting>,{{bb[0-9]+}}<latch>

// CHECK-LABEL:--- XLA CFG Loops After Canonicalize: $loopThatRequiresNodeAndLoopCloning
// CHECK:Loop at depth 1 containing: [[L1HDR:bb[0-9]+]]<header><exiting>,[[L1LATCH:bb[0-9]+]]<latch>
// CHECK:Loop at depth 1 containing: [[L2HDR:bb[0-9]+]]<header><exiting>,{{.*}},[[L2LATCH:bb[0-9]+]]<latch>
//----Following is the clone of the first loop.
// CHECK: Loop at depth 2 containing: [[L3HDR:bb[0-9]+]]<header><exiting>,[[L3LATCH:bb[0-9]+]]<latch>

// CHECK-LABEL:--- XLA CFG Canonicalize: $loopThatRequiresNodeAndLoopCloning
// CHECK:[sequence
// CHECK: {condition Header: {{bb[0-9]+}}
// CHECK: [sequence
// CHECK: <while Preheader: {{bb[0-9]+}}, Header: [[L1HDR]], exit: {{bb[0-9]+}}
// CHECK: block [[L1LATCH]]>
// CHECK: block {{bb[0-9]+}}]
// CHECK: [sequence
// CHECK: <while Preheader: {{bb[0-9]+}}, Header: [[L2HDR]], exit: {{bb[0-9]+}}
// CHECK: [sequence
// CHECK: {condition Header: {{bb[0-9]+}}
// CHECK: block {{bb[0-9]+}}
// CHECK: {condition Header: {{bb[0-9]+}}
// CHECK: [sequence
// CHECK: <while Preheader: {{bb[0-9]+}}, Header: [[L3HDR]], exit: {{bb[0-9]+}}
// CHECK: block [[L3LATCH]]>
// CHECK: block {{bb[0-9]+}}]
// CHECK: block {{bb[0-9]+}}}}
// CHECK: block [[L2LATCH]]]>
// CHECK: block {{bb[0-9]+}}]}
// CHECK: block {{bb[0-9]+}}]

sil @$loopThatRequiresNodeAndLoopCloning : $@convention(thin) (Builtin.Int32, Builtin.Int32) -> Builtin.Int32 {
bb0(%0 : $Builtin.Int32, %1 : $Builtin.Int32):
%2 = integer_literal $Builtin.Int32, 1
%3 = integer_literal $Builtin.Int32, 100
%4 = builtin "sadd_with_overflow_Int32"(%0 : $Builtin.Int32, %2 : $Builtin.Int32) : $Builtin.Int32
%5 = builtin "cmp_slt_Int32"(%4 : $Builtin.Int32, %3 : $Builtin.Int32) : $Builtin.Int1
cond_br %5, bb1, bb2

bb1:
// First arg is sum, the second arg is loop counter.
br bb3(%4 : $Builtin.Int32, %0 : $Builtin.Int32)

bb2:
br bb4(%4 : $Builtin.Int32, %0 : $Builtin.Int32)

bb3(%6 : $Builtin.Int32, %7 : $Builtin.Int32):
%8 = builtin "ssub_with_overflow_Int32"(%6 : $Builtin.Int32, %7 : $Builtin.Int32) : $Builtin.Int32
%100 = builtin "cmp_slt_Int32"(%8 : $Builtin.Int32, %7 : $Builtin.Int32) : $Builtin.Int1
cond_br %100, bb3(%7: $Builtin.Int32, %8: $Builtin.Int32), bb9(%8 : $Builtin.Int32)

bb4(%9 : $Builtin.Int32, %10 : $Builtin.Int32):
%11 = builtin "sadd_with_overflow_Int32"(%10 : $Builtin.Int32, %2 : $Builtin.Int32) : $Builtin.Int32
%12 = builtin "sadd_with_overflow_Int32"(%9 : $Builtin.Int32, %10 : $Builtin.Int32) : $Builtin.Int32
%13 = builtin "cmp_slt_Int32"(%11 : $Builtin.Int32, %1 : $Builtin.Int32) : $Builtin.Int1
cond_br %13, bb5, bb6

bb5:
br bb9(%12 : $Builtin.Int32)

bb6:
%14 = builtin "sadd_with_overflow_Int32"(%11 : $Builtin.Int32, %2 : $Builtin.Int32) : $Builtin.Int32
%15 = builtin "sadd_with_overflow_Int32"(%12 : $Builtin.Int32, %14 : $Builtin.Int32) : $Builtin.Int32
%16 = builtin "cmp_slt_Int32"(%14 : $Builtin.Int32, %1 : $Builtin.Int32) : $Builtin.Int1
cond_br %16, bb8, bb7

bb7:
br bb4(%15 : $Builtin.Int32, %14 : $Builtin.Int32)

bb8:
br bb3(%15 : $Builtin.Int32, %14 : $Builtin.Int32)

bb9(%18 : $Builtin.Int32):
return %18 : $Builtin.Int32
}
Loading