Skip to content

[SYCL] Proxy DLL Loader for Windows #7756

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
d726e1c
DLLs manually loaded by SYCL are not tracked as direct dependencies in…
cperkinsintel Dec 13, 2022
1457761
stray comment removed
cperkinsintel Dec 13, 2022
052afd7
DLL_PROCESS_DETACH added to sycl_pi_trace
cperkinsintel Dec 19, 2022
c723c45
clang-format
cperkinsintel Dec 19, 2022
5b70c21
yet more clang-format
cperkinsintel Dec 20, 2022
c648c09
Merge branch 'sycl' into cperkins-win_proxy_loader
cperkinsintel Dec 21, 2022
a935f29
Merge branch 'sycl' into cperkins-win_proxy_loader
cperkinsintel Dec 23, 2022
defd2d7
unified runtime and error suppression
cperkinsintel Dec 23, 2022
06fdb87
clang-format for not the last time
cperkinsintel Dec 23, 2022
454f4c8
moar clang-format
cperkinsintel Dec 23, 2022
1064627
Merge branch 'sycl' into cperkins-win_proxy_loader
cperkinsintel Jan 2, 2023
731a83d
restoring static default context tracker. Hip seems to need it.
cperkinsintel Jan 4, 2023
a1075d9
Merge branch 'sycl' into cperkins-win_proxy_loader
cperkinsintel Jan 5, 2023
1753fd5
when using XPTI we re-encounter dll unload issues. I believe I've eli…
cperkinsintel Jan 6, 2023
53aeee0
updated from sycl branch and resolved merge conflicts. Since we do no…
cperkinsintel Jan 12, 2023
a308dc3
not deferring fixes a lot of tests/bugs
cperkinsintel Jan 13, 2023
e30812f
Merge branch 'sycl' into cperkins-win_proxy_loader
cperkinsintel Jan 13, 2023
04da9a9
all tests but one passing
cperkinsintel Jan 13, 2023
af588a1
the fight with clang-format begins
cperkinsintel Jan 13, 2023
75ad94e
clang-format puts one up on the board
cperkinsintel Jan 14, 2023
b07d79d
spelling
cperkinsintel Jan 15, 2023
92756bd
ensuring win_proxy_loader is part of CMake install
cperkinsintel Jan 17, 2023
935e200
Merge branch 'sycl' into cperkins-win_proxy_loader
cperkinsintel Jan 18, 2023
c6255fc
works locally. having trouble with CI
cperkinsintel Jan 19, 2023
08f77d8
reenable windows for participation in deferred mem release. Have made…
cperkinsintel Feb 1, 2023
6d2b5ea
resolve merge conflict
cperkinsintel Feb 1, 2023
4ecbd0a
clang-format
cperkinsintel Feb 1, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions sycl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,7 @@ add_custom_target( sycl-toolchain
DEPENDS sycl-runtime-libraries
sycl-compiler
sycl-ls
win_proxy_loader
${XPTIFW_LIBS}
COMMENT "Building SYCL compiler toolchain..."
)
Expand Down Expand Up @@ -341,6 +342,8 @@ add_subdirectory( plugins )

add_subdirectory(tools)

add_subdirectory(win_proxy_loader)

if(SYCL_INCLUDE_TESTS)
if(NOT LLVM_INCLUDE_TESTS)
message(FATAL_ERROR
Expand Down Expand Up @@ -383,6 +386,7 @@ set( SYCL_TOOLCHAIN_DEPLOY_COMPONENTS
sycl
libsycldevice
level-zero-sycl-dev
win_proxy_loader
${XPTIFW_LIBS}
${SYCL_TOOLCHAIN_DEPS}
)
Expand Down Expand Up @@ -450,6 +454,7 @@ if("esimd_emulator" IN_LIST SYCL_ENABLE_PLUGINS)
endif()
endif()


# Use it as fake dependency in order to force another command(s) to execute.
add_custom_command(OUTPUT __force_it
COMMAND "${CMAKE_COMMAND}" -E echo
Expand Down
2 changes: 2 additions & 0 deletions sycl/include/sycl/detail/pi.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ enum TraceLevel {
bool trace(TraceLevel level);

#ifdef __SYCL_RT_OS_WINDOWS
// these same constants are used by win_proxy_loader.dll
// if a plugin is added here, add it there as well.
#ifdef _MSC_VER
#define __SYCL_OPENCL_PLUGIN_NAME "pi_opencl.dll"
#define __SYCL_LEVEL_ZERO_PLUGIN_NAME "pi_level_zero.dll"
Expand Down
34 changes: 34 additions & 0 deletions sycl/plugins/common_win_pi_trace/common_win_pi_trace.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
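// This header is textually injected into a plugin's .cpp file and defines that
// plugin's DllMain. Define __SYCL_PLUGIN_DLL_NAME (a string literal naming the
// plugin DLL) before including it.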
#ifdef _WIN32
#include <windows.h>
/// Windows DLL entry point injected into each PI plugin.
///
/// When the SYCL_PI_TRACE environment variable is -1 or 2, prints a
/// "---> DLL_PROCESS_ATTACH/DETACH <dll name>" line to stdout on process
/// attach and detach, using __SYCL_PLUGIN_DLL_NAME as the DLL name. Thread
/// attach/detach notifications are ignored.
///
/// \param hinstDLL   handle to the DLL module (unused).
/// \param fdwReason  reason the function is being called (DLL_PROCESS_ATTACH,
///                   DLL_PROCESS_DETACH, DLL_THREAD_ATTACH, DLL_THREAD_DETACH).
/// \param lpReserved reserved (unused).
/// \return always TRUE.
BOOL WINAPI DllMain(HINSTANCE hinstDLL, // handle to DLL module
                    DWORD fdwReason,    // reason for calling function
                    LPVOID lpReserved) { // reserved

  // The environment is read and parsed once and cached for later calls.
  // std::strtol is used instead of std::stoi because it never throws: a
  // malformed SYCL_PI_TRACE value (e.g. "abc") simply parses as 0 instead of
  // letting an exception escape from DllMain.
  static const char *PiTrace = std::getenv("SYCL_PI_TRACE");
  static const long PiTraceValue =
      PiTrace ? std::strtol(PiTrace, nullptr, 10) : 0;

  // -1 and 2 both mean "print all PI traces".
  const bool PrintPiTrace = (PiTraceValue == -1 || PiTraceValue == 2);

  // Perform actions based on the reason for calling.
  switch (fdwReason) {
  case DLL_PROCESS_DETACH:
    if (PrintPiTrace)
      std::cout << "---> DLL_PROCESS_DETACH " << __SYCL_PLUGIN_DLL_NAME << "\n"
                << std::endl;
    break;
  case DLL_PROCESS_ATTACH:
    if (PrintPiTrace)
      std::cout << "---> DLL_PROCESS_ATTACH " << __SYCL_PLUGIN_DLL_NAME << "\n"
                << std::endl;
    break; // explicit: previously fell through to the no-op thread cases
  case DLL_THREAD_ATTACH:
  case DLL_THREAD_DETACH:
    break;
  }
  return TRUE;
}
#endif // _WIN32
6 changes: 6 additions & 0 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5862,6 +5862,12 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
return PI_SUCCESS;
}

#ifdef _WIN32
#define __SYCL_PLUGIN_DLL_NAME "pi_cuda.dll"
#include "../common_win_pi_trace/common_win_pi_trace.hpp"
#undef __SYCL_PLUGIN_DLL_NAME
#endif

} // extern "C"

CUevent _pi_platform::evBase_{nullptr};
6 changes: 6 additions & 0 deletions sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2102,4 +2102,10 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
return PI_SUCCESS;
}

#ifdef _WIN32
#define __SYCL_PLUGIN_DLL_NAME "pi_esimd_emulator.dll"
#include "../common_win_pi_trace/common_win_pi_trace.hpp"
#undef __SYCL_PLUGIN_DLL_NAME
#endif

} // extern C
6 changes: 6 additions & 0 deletions sycl/plugins/hip/pi_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5510,6 +5510,12 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
return PI_SUCCESS;
}

#ifdef _WIN32
#define __SYCL_PLUGIN_DLL_NAME "pi_hip.dll"
#include "../common_win_pi_trace/common_win_pi_trace.hpp"
#undef __SYCL_PLUGIN_DLL_NAME
#endif

} // extern "C"

hipEvent_t _pi_platform::evBase_{nullptr};
6 changes: 6 additions & 0 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9438,4 +9438,10 @@ pi_result piGetDeviceAndHostTimer(pi_device Device, uint64_t *DeviceTime,
}
return PI_SUCCESS;
}

#ifdef _WIN32
#define __SYCL_PLUGIN_DLL_NAME "pi_level_zero.dll"
#include "../common_win_pi_trace/common_win_pi_trace.hpp"
#undef __SYCL_PLUGIN_DLL_NAME
#endif
} // extern "C"
6 changes: 6 additions & 0 deletions sycl/plugins/opencl/pi_opencl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1941,4 +1941,10 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
return PI_SUCCESS;
}

#ifdef _WIN32
#define __SYCL_PLUGIN_DLL_NAME "pi_opencl.dll"
#include "../common_win_pi_trace/common_win_pi_trace.hpp"
#undef __SYCL_PLUGIN_DLL_NAME
#endif

} // end extern 'C'
11 changes: 11 additions & 0 deletions sycl/source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ if (SYCL_ENABLE_XPTI_TRACING)
include_directories(${LLVM_EXTERNAL_XPTI_SOURCE_DIR}/include)
endif()


function(add_sycl_rt_library LIB_NAME LIB_OBJ_NAME)
# Add an optional argument so we can get the library name to
# link with for Windows Debug version
Expand Down Expand Up @@ -53,6 +54,14 @@ function(add_sycl_rt_library LIB_NAME LIB_OBJ_NAME)
target_link_libraries(${LIB_NAME} PRIVATE ${ARG_XPTI_LIB})
endif()

# win_proxy_loader
include_directories(${LLVM_EXTERNAL_SYCL_SOURCE_DIR}/win_proxy_loader)
if(WIN_DUPE)
target_link_libraries(${LIB_NAME} PUBLIC win_proxy_loaderd)
else()
target_link_libraries(${LIB_NAME} PUBLIC win_proxy_loader)
endif()

target_compile_definitions(${LIB_OBJ_NAME} PRIVATE __SYCL_INTERNAL_API )

if (WIN32)
Expand Down Expand Up @@ -215,11 +224,13 @@ if (MSVC)
string(REGEX REPLACE "/MT" "" ${flag_var} "${${flag_var}}")
endforeach()

set(WIN_DUPE "1")
if (SYCL_ENABLE_XPTI_TRACING)
add_sycl_rt_library(sycl${SYCL_MAJOR_VERSION}d sycld_object XPTI_LIB xptid COMPILE_OPTIONS "/MDd" SOURCES ${SYCL_SOURCES})
else()
add_sycl_rt_library(sycl${SYCL_MAJOR_VERSION}d sycld_object COMPILE_OPTIONS "/MDd" SOURCES ${SYCL_SOURCES})
endif()
unset(WIN_DUPE)
add_library(sycld ALIAS sycl${SYCL_MAJOR_VERSION}d)

set(SYCL_EXTRA_OPTS "/MD")
Expand Down
2 changes: 1 addition & 1 deletion sycl/source/detail/context_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ context_impl::~context_impl() {
}
if (!MHostContext) {
// TODO catch an exception and put it to list of asynchronous exceptions
getPlugin().call<PiApiKind::piContextRelease>(MContext);
getPlugin().call_nocheck<PiApiKind::piContextRelease>(MContext);
}
}

Expand Down
7 changes: 7 additions & 0 deletions sycl/source/detail/event_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,13 @@ void event_impl::waitInternal() {
"waitInternal method cannot be used for a discarded event.");
} else if (MState != HES_Complete) {
// Wait for the host event
#ifdef _WIN32
// during shutdown it's possible that outstanding threads on win may
// be terminated, in which case the NotifyHostTaskComplete will not be
// called and the cv.wait() below will hang forever.
if (Scheduler::getInstance().isShuttingDown)
return;
#endif
std::unique_lock<std::mutex> lock(MMutex);
cv.wait(lock, [this] { return MState == HES_Complete; });
}
Expand Down
41 changes: 28 additions & 13 deletions sycl/source/detail/global_handler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,10 @@ class ObjectUsageCounter {
MCounter++;
}
~ObjectUsageCounter() {
#if defined(XPTI_ENABLE_INSTRUMENTATION) && defined(_WIN32)
if (xptiTraceEnabled())
return; // When xpti tracing, can't safely perform some shutdown ops.
#endif
if (!MModifyCounter)
return;

Expand Down Expand Up @@ -165,20 +169,14 @@ void GlobalHandler::releaseDefaultContexts() {
// finished. To avoid calls to nowhere, intentionally leak platform to device
// cache. This will prevent destructors from being called, thus no PI cleanup
// routines will be called in the end.
// Update: the win_proxy_loader addresses this for SYCL's own dependencies,
// but the GPU device dlls seem to manually load yet another DLL which may
// have been released when this function is called. So we still release() and
// leak until that is addressed. DefaultContext destructs fine on CPU device.
MPlatformToDefaultContextCache.Inst.release();
#endif
}

struct DefaultContextReleaseHandler {
~DefaultContextReleaseHandler() {
GlobalHandler::instance().releaseDefaultContexts();
}
};

void GlobalHandler::registerDefaultContextReleaseHandler() {
static DefaultContextReleaseHandler handler{};
}

// Note: Split from shutdown so it is available to the unittests for ensuring
// that the mock plugin is the lone plugin.
void GlobalHandler::unloadPlugins() {
Expand All @@ -202,9 +200,9 @@ void GlobalHandler::unloadPlugins() {
void GlobalHandler::prepareSchedulerToRelease() {
#ifndef _WIN32
drainThreadPool();
#endif
if (MScheduler.Inst)
MScheduler.Inst->releaseResources();
#endif
}

void GlobalHandler::drainThreadPool() {
Expand Down Expand Up @@ -251,13 +249,30 @@ void shutdown() {
extern "C" __SYCL_EXPORT BOOL WINAPI DllMain(HINSTANCE hinstDLL,
DWORD fdwReason,
LPVOID lpReserved) {
bool PrintPiTrace = false;
static const char *PiTrace = std::getenv("SYCL_PI_TRACE");
static const int PiTraceValue = PiTrace ? std::stoi(PiTrace) : 0;
if (PiTraceValue == -1 || PiTraceValue == 2) { // Means print all PI traces
PrintPiTrace = true;
}

// Perform actions based on the reason for calling.
switch (fdwReason) {
case DLL_PROCESS_DETACH:
if (!lpReserved)
shutdown();
if (PrintPiTrace)
std::cout << "---> DLL_PROCESS_DETACH syclx.dll\n" << std::endl;

#ifdef XPTI_ENABLE_INSTRUMENTATION
if (xptiTraceEnabled())
return TRUE; // When doing xpti tracing, we can't safely call shutdown.
// TODO: figure out what XPTI is doing that prevents release.
#endif

shutdown();
break;
case DLL_PROCESS_ATTACH:
if (PrintPiTrace)
std::cout << "---> DLL_PROCESS_ATTACH syclx.dll\n" << std::endl;
case DLL_THREAD_ATTACH:
case DLL_THREAD_DETACH:
break;
Expand Down
2 changes: 2 additions & 0 deletions sycl/source/detail/os_util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,8 @@ OSModuleHandle OSUtil::getOSModuleHandle(const void *VirtAddr) {
}

/// Returns an absolute path where the object was found.
// win_proxy_loader.dll uses this same logic. If it is changed
// significantly, it might be wise to change it there too.
std::string OSUtil::getCurrentDSODir() {
char Path[MAX_PATH];
Path[0] = '\0';
Expand Down
10 changes: 0 additions & 10 deletions sycl/source/detail/platform_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -137,16 +137,6 @@ std::vector<platform> platform_impl::get_platforms() {
}
}

// Register default context release handler after plugins have been loaded and
// after the first calls to each plugin. This initializes a function-local
// variable that should be destroyed before any global variables in the
// plugins are destroyed. This is done after the first call to the backends to
// ensure any lazy-loaded dependencies are loaded prior to the handler
// variable's initialization. Note: The default context release handler is not
// guaranteed to be destroyed before function-local static variables as they
// may be initialized after.
GlobalHandler::registerDefaultContextReleaseHandler();

return Platforms;
}

Expand Down
16 changes: 13 additions & 3 deletions sycl/source/detail/scheduler/scheduler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -392,21 +392,29 @@ Scheduler::Scheduler() {
Scheduler::~Scheduler() { DefaultHostQueue.reset(); }

void Scheduler::releaseResources() {

BlockingT blockValue = BlockingT::BLOCKING;

// There might be some commands scheduled for post enqueue cleanup that
// haven't been freed because of the graph mutex being locked at the time,
// clean them up now.
cleanupCommands({});

cleanupAuxiliaryResources(BlockingT::BLOCKING);
cleanupAuxiliaryResources(blockValue);

// We need loop since sometimes we may need new objects to be added to
// deferred mem objects storage during cleanup. Known example is: we cleanup
// existing deferred mem objects under write lock, during this process we
// cleanup commands related to this record, command may have last reference to
// queue_impl, ~queue_impl is called and buffer for assert (which is created
// with size only so all confitions for deferred release are satisfied) is
// with size only so all conditions for deferred release are satisfied) is
// added to deferred mem obj storage. So we may end up with leak.
// Windows: once we are shutting down, we can't rely on thread completion.
// So we clean up the deferred mem objects, but don't loop.
#ifndef _WIN32
while (!isDeferredMemObjectsEmpty())
cleanupDeferredMemObjects(BlockingT::BLOCKING);
#endif
cleanupDeferredMemObjects(blockValue);
}

MemObjRecord *Scheduler::getMemObjRecord(const Requirement *const Req) {
Expand Down Expand Up @@ -492,13 +500,15 @@ void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) {
if (isDeferredMemObjectsEmpty())
return;
if (Blocking == BlockingT::BLOCKING) {
this->isShuttingDown = true;
std::vector<std::shared_ptr<SYCLMemObjI>> TempStorage;
{
std::lock_guard<std::mutex> LockDef{MDeferredMemReleaseMutex};
MDeferredMemObjRelease.swap(TempStorage);
}
// if any objects in TempStorage exist - it is leaving scope and being
// deleted
return;
}

std::vector<std::shared_ptr<SYCLMemObjI>> ObjsReadyToRelease;
Expand Down
1 change: 1 addition & 0 deletions sycl/source/detail/scheduler/scheduler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -499,6 +499,7 @@ class Scheduler {

// May lock graph with read and write modes during execution.
void cleanupDeferredMemObjects(BlockingT Blocking);
bool isShuttingDown = false;

// POD struct to convey some additional information from GraphBuilder::addCG
// to the Scheduler to support kernel fusion.
Expand Down
22 changes: 5 additions & 17 deletions sycl/source/detail/windows_pi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,29 +13,17 @@
#include <windows.h>
#include <winreg.h>

#include "win_proxy_loader.hpp"

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
namespace detail {
namespace pi {

void *loadOsLibrary(const std::string &PluginPath) {
// Tells the system to not display the critical-error-handler message box.
// Instead, the system sends the error to the calling process.
// This is crucial for graceful handling of plugins that couldn't be
// loaded, e.g. due to missing native run-times.
// TODO: add reporting in case of an error.
// NOTE: we restore the old mode to not affect user app behavior.
//
UINT SavedMode = SetErrorMode(SEM_FAILCRITICALERRORS);
// Exclude current directory from DLL search path
if (!SetDllDirectoryA("")) {
assert(false && "Failed to update DLL search path");
}
auto Result = (void *)LoadLibraryA(PluginPath.c_str());
(void)SetErrorMode(SavedMode);
if (!SetDllDirectoryA(nullptr)) {
assert(false && "Failed to restore DLL search path");
}
// We fetch the preloaded plugin from the win_proxy_loader.
// The proxy_loader handles any required error suppression.
auto Result = getPreloadedPlugin(PluginPath);

return Result;
}
Expand Down
Loading