Skip to content

Commit 89e82e3

Browse files
authored
[SYCL][Fusion] Scheduler support for kernel fusion (#7531)
This is the third patch in a series of patches to add an implementation of the [kernel fusion extension](#7098). We have split the implementation into multiple patches to make them more easy to review. This patch integrates the kernel fusion extension into the SYCL runtime scheduler. Next to collecting the kernels submitted while in fusion mode in the fusion list associated with the queue, the integration into the scheduler is also responsible for detecting the synchronization scenarios. Various scenarios, such as buffer destruction or event wait, require fusion to be aborted early. The full list of scenarios is available in the [extension proposal](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc#synchronization-in-the-sycl-application). A high-level description of the integration into the scheduler can be found in the [design document](#7204). This PR can be reviewed and merged independently of #7465. Signed-off-by: Lukas Sommer <[email protected]> Signed-off-by: Lukas Sommer <[email protected]>
1 parent ea44995 commit 89e82e3

File tree

13 files changed

+901
-82
lines changed

13 files changed

+901
-82
lines changed

sycl/source/detail/fusion/fusion_wrapper_impl.cpp

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
#include <detail/fusion/fusion_wrapper_impl.hpp>
1010

11+
#include <detail/scheduler/scheduler.hpp>
12+
1113
namespace sycl {
1214
__SYCL_INLINE_VER_NAMESPACE(_V1) {
1315
namespace detail {
@@ -20,22 +22,22 @@ std::shared_ptr<detail::queue_impl> fusion_wrapper_impl::get_queue() const {
2022
return MQueue;
2123
}
2224

23-
bool fusion_wrapper_impl::is_in_fusion_mode() const { return false; }
25+
bool fusion_wrapper_impl::is_in_fusion_mode() const {
26+
return MQueue->is_in_fusion_mode();
27+
}
2428

2529
void fusion_wrapper_impl::start_fusion() {
26-
throw sycl::exception(sycl::errc::feature_not_supported,
27-
"Fusion not yet implemented");
30+
detail::Scheduler::getInstance().startFusion(MQueue);
2831
}
2932

3033
void fusion_wrapper_impl::cancel_fusion() {
31-
throw sycl::exception(sycl::errc::feature_not_supported,
32-
"Fusion not yet implemented");
34+
detail::Scheduler::getInstance().cancelFusion(MQueue);
3335
}
3436

3537
event fusion_wrapper_impl::complete_fusion(const property_list &PropList) {
36-
(void)PropList;
37-
throw sycl::exception(sycl::errc::feature_not_supported,
38-
"Fusion not yet implemented");
38+
auto EventImpl =
39+
detail::Scheduler::getInstance().completeFusion(MQueue, PropList);
40+
return detail::createSyclObjFromImpl<event>(EventImpl);
3941
}
4042

4143
} // namespace detail

sycl/source/detail/helpers.cpp

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
//
77
//===----------------------------------------------------------------------===//
88

9+
#include <detail/scheduler/commands.hpp>
910
#include <sycl/detail/helpers.hpp>
1011

1112
#include <detail/context_impl.hpp>
@@ -30,9 +31,16 @@ std::vector<RT::PiEvent> getOrWaitEvents(std::vector<sycl::event> DepEvents,
3031
!SyclEventImplPtr->is_host()) {
3132
continue;
3233
}
34+
// The fusion command and its event are associated with a non-host context,
35+
// but still does not produce a PI event.
36+
bool NoPiEvent =
37+
SyclEventImplPtr->MCommand &&
38+
!static_cast<Command *>(SyclEventImplPtr->MCommand)->producesPiEvent();
3339
if (SyclEventImplPtr->is_host() ||
34-
SyclEventImplPtr->getContextImpl() != Context) {
35-
SyclEventImplPtr->waitInternal();
40+
SyclEventImplPtr->getContextImpl() != Context || NoPiEvent) {
41+
// Call wait, because the command for the event might not have been
42+
// enqueued when kernel fusion is happening.
43+
SyclEventImplPtr->wait(SyclEventImplPtr);
3644
} else {
3745
Events.push_back(SyclEventImplPtr->getHandleRef());
3846
}

sycl/source/detail/queue_impl.hpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,15 @@ class queue_impl {
498498

499499
bool ext_oneapi_empty() const;
500500

501+
/// Check whether the queue is in fusion mode.
502+
///
503+
/// \return true if the queue is in fusion mode, false otherwise.
504+
bool is_in_fusion_mode() {
505+
return detail::Scheduler::getInstance().isInFusionMode(
506+
std::hash<typename std::shared_ptr<queue_impl>::element_type *>()(
507+
this));
508+
}
509+
501510
protected:
502511
// template is needed for proper unit testing
503512
template <typename HandlerType = handler>

sycl/source/detail/scheduler/commands.cpp

Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,8 @@ static std::string commandToNodeType(Command::CommandType Type) {
168168
return "host_acc_create_buffer_lock_node";
169169
case Command::CommandType::EMPTY_TASK:
170170
return "host_acc_destroy_buffer_release_node";
171+
case Command::CommandType::FUSION:
172+
return "kernel_fusion_placeholder_node";
171173
default:
172174
return "unknown_node";
173175
}
@@ -196,6 +198,8 @@ static std::string commandToName(Command::CommandType Type) {
196198
return "Host Accessor Creation/Buffer Lock";
197199
case Command::CommandType::EMPTY_TASK:
198200
return "Host Accessor Destruction/Buffer Lock Release";
201+
case Command::CommandType::FUSION:
202+
return "Kernel Fusion Placeholder";
199203
default:
200204
return "Unknown Action";
201205
}
@@ -2586,6 +2590,125 @@ bool ExecCGCommand::readyForCleanup() const {
25862590
return MLeafCounter == 0 && MEvent->isCompleted();
25872591
return Command::readyForCleanup();
25882592
}
2593+
2594+
KernelFusionCommand::KernelFusionCommand(QueueImplPtr Queue)
2595+
: Command(Command::CommandType::FUSION, Queue),
2596+
MStatus(FusionStatus::ACTIVE) {
2597+
emitInstrumentationDataProxy();
2598+
}
2599+
2600+
std::vector<Command *> &KernelFusionCommand::auxiliaryCommands() {
2601+
return MAuxiliaryCommands;
2602+
}
2603+
2604+
void KernelFusionCommand::addToFusionList(ExecCGCommand *Kernel) {
2605+
MFusionList.push_back(Kernel);
2606+
}
2607+
2608+
std::vector<ExecCGCommand *> &KernelFusionCommand::getFusionList() {
2609+
return MFusionList;
2610+
}
2611+
2612+
bool KernelFusionCommand::producesPiEvent() const { return false; }
2613+
2614+
pi_int32 KernelFusionCommand::enqueueImp() {
2615+
waitForPreparedHostEvents();
2616+
waitForEvents(MQueue, MPreparedDepsEvents, MEvent->getHandleRef());
2617+
2618+
return PI_SUCCESS;
2619+
}
2620+
2621+
void KernelFusionCommand::setFusionStatus(FusionStatus Status) {
2622+
MStatus = Status;
2623+
}
2624+
2625+
void KernelFusionCommand::emitInstrumentationData() {
2626+
#ifdef XPTI_ENABLE_INSTRUMENTATION
2627+
if (!xptiTraceEnabled()) {
2628+
return;
2629+
}
2630+
// Create a payload with the command name and an event using this payload to
2631+
// emit a node_create
2632+
MCommandNodeType = commandToNodeType(MType);
2633+
MCommandName = commandToName(MType);
2634+
2635+
static unsigned FusionNodeCount = 0;
2636+
std::stringstream PayloadStr;
2637+
PayloadStr << "Fusion command #" << FusionNodeCount++;
2638+
xpti::payload_t Payload = xpti::payload_t(PayloadStr.str().c_str());
2639+
2640+
uint64_t CommandInstanceNo = 0;
2641+
xpti_td *CmdTraceEvent =
2642+
xptiMakeEvent(MCommandName.c_str(), &Payload, xpti::trace_graph_event,
2643+
xpti_at::active, &CommandInstanceNo);
2644+
2645+
MInstanceID = CommandInstanceNo;
2646+
if (CmdTraceEvent) {
2647+
MTraceEvent = static_cast<void *>(CmdTraceEvent);
2648+
// If we are seeing this event again, then the instance ID
2649+
// will be greater
2650+
// than 1; in this case, we must skip sending a
2651+
// notification to create a node as this node has already
2652+
// been created. We return this value so the epilog method
2653+
// can be called selectively.
2654+
// See makeTraceEventProlog.
2655+
MFirstInstance = (CommandInstanceNo == 1);
2656+
}
2657+
2658+
// This function is called in the constructor of the command. At this point
2659+
// the kernel fusion list is still empty, so we don't have a terrible lot of
2660+
// information we could attach to this node here.
2661+
if (MFirstInstance && CmdTraceEvent) {
2662+
xpti::addMetadata(CmdTraceEvent, "sycl_device",
2663+
deviceToID(MQueue->get_device()));
2664+
xpti::addMetadata(CmdTraceEvent, "sycl_device_type",
2665+
deviceToString(MQueue->get_device()));
2666+
xpti::addMetadata(CmdTraceEvent, "sycl_device_name",
2667+
getSyclObjImpl(MQueue->get_device())->getDeviceName());
2668+
}
2669+
2670+
if (MFirstInstance) {
2671+
xptiNotifySubscribers(MStreamID, xpti::trace_node_create,
2672+
detail::GSYCLGraphEvent,
2673+
static_cast<xpti_td *>(MTraceEvent), MInstanceID,
2674+
static_cast<const void *>(MCommandNodeType.c_str()));
2675+
}
2676+
2677+
#endif
2678+
}
2679+
2680+
void KernelFusionCommand::printDot(std::ostream &Stream) const {
2681+
Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\"";
2682+
2683+
Stream << "ID = " << this << "\\n";
2684+
Stream << "KERNEL FUSION on " << deviceToString(MQueue->get_device()) << "\\n"
2685+
<< "FUSION LIST: {";
2686+
bool Initial = true;
2687+
for (auto *Cmd : MFusionList) {
2688+
if (!Initial) {
2689+
Stream << ",\\n";
2690+
}
2691+
Initial = false;
2692+
auto *KernelCG = static_cast<detail::CGExecKernel *>(&Cmd->getCG());
2693+
if (KernelCG->MSyclKernel && KernelCG->MSyclKernel->isCreatedFromSource()) {
2694+
Stream << "created from source";
2695+
} else {
2696+
Stream << demangleKernelName(KernelCG->getKernelName());
2697+
}
2698+
}
2699+
Stream << "}\\n";
2700+
2701+
Stream << "\"];" << std::endl;
2702+
2703+
for (const auto &Dep : MDeps) {
2704+
Stream << " \"" << this << "\" -> \"" << Dep.MDepCommand << "\""
2705+
<< " [ label = \"Access mode: "
2706+
<< accessModeToString(Dep.MDepRequirement->MAccessMode) << "\\n"
2707+
<< "MemObj: " << Dep.MDepRequirement->MSYCLMemObj << " \" ]"
2708+
<< std::endl;
2709+
}
2710+
}
2711+
25892712
} // namespace detail
25902713
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
25912714
} // namespace sycl

sycl/source/detail/scheduler/commands.hpp

Lines changed: 52 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,8 @@ class Command {
104104
UNMAP_MEM_OBJ,
105105
UPDATE_REQUIREMENT,
106106
EMPTY_TASK,
107-
HOST_TASK
107+
HOST_TASK,
108+
FUSION
108109
};
109110

110111
Command(CommandType Type, QueueImplPtr Queue);
@@ -571,6 +572,8 @@ pi_int32 enqueueImpKernel(
571572
std::vector<RT::PiEvent> &RawEvents, RT::PiEvent *OutEvent,
572573
const std::function<void *(Requirement *Req)> &getMemAllocationFunc);
573574

575+
class KernelFusionCommand;
576+
574577
/// The exec CG command enqueues execution of kernel or explicit memory
575578
/// operation.
576579
class ExecCGCommand : public Command {
@@ -586,6 +589,17 @@ class ExecCGCommand : public Command {
586589

587590
detail::CG &getCG() const { return *MCommandGroup; }
588591

592+
// MEmptyCmd is only employed if this command refers to host-task.
593+
// The mechanism of lookup for single EmptyCommand amongst users of
594+
// host-task-representing command is unreliable. This unreliability roots in
595+
// the cleanup process.
596+
EmptyCommand *MEmptyCmd = nullptr;
597+
598+
// MFusionCommand is employed to mark a CG command as part of a kernel fusion
599+
// and allows to refer back to the corresponding KernelFusionCommand if
600+
// necessary.
601+
KernelFusionCommand *MFusionCmd = nullptr;
602+
589603
bool producesPiEvent() const final;
590604

591605
bool supportsPostEnqueueCleanup() const final;
@@ -619,6 +633,43 @@ class UpdateHostRequirementCommand : public Command {
619633
void **MDstPtr = nullptr;
620634
};
621635

636+
/// The KernelFusionCommand is placed in the execution graph together with the
637+
/// individual kernels of the fusion list to control kernel fusion.
638+
class KernelFusionCommand : public Command {
639+
public:
640+
enum class FusionStatus { ACTIVE, CANCELLED, COMPLETE, DELETED };
641+
642+
explicit KernelFusionCommand(QueueImplPtr Queue);
643+
644+
void printDot(std::ostream &Stream) const final;
645+
void emitInstrumentationData() final;
646+
bool producesPiEvent() const final;
647+
648+
std::vector<Command *> &auxiliaryCommands();
649+
650+
void addToFusionList(ExecCGCommand *Kernel);
651+
652+
std::vector<ExecCGCommand *> &getFusionList();
653+
654+
///
655+
/// Set the status of this fusion command to \p Status. This function should
656+
/// only be called under the protection of the scheduler write-lock.
657+
void setFusionStatus(FusionStatus Status);
658+
659+
bool isActive() const { return MStatus == FusionStatus::ACTIVE; }
660+
661+
bool readyForDeletion() const { return MStatus == FusionStatus::DELETED; }
662+
663+
private:
664+
pi_int32 enqueueImp() final;
665+
666+
std::vector<ExecCGCommand *> MFusionList;
667+
668+
std::vector<Command *> MAuxiliaryCommands;
669+
670+
FusionStatus MStatus;
671+
};
672+
622673
} // namespace detail
623674
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
624675
} // namespace sycl

0 commit comments

Comments
 (0)