Skip to content

Commit 36be9d0

Browse files
[SYCL] fix for tensorflow, linux only (#14153)
On Linux, the TensorFlow library (which uses SYCL .so) is having a problem wherein its own releasing of resource is running afoul ours. The solution is to stop deferring any new memory releases once we know we are shutting down. To that end I've broken the shutdown into two parts (early and late) and added a flag for whether its OK to defer memory or not. This has been confirmed to fix the problem. It is not entirely clear to me what, exactly, the underlying cause of the problem is on the TensorFlow side. But I realized that while SYCL is used by several intermediate libs we have no testing of this scenario, so I've begun to add some. Lastly, on Windows, I've got encouraging results that this approach may help us unify shutdown so it's the same on both Linux and Win, but that is a larger work and should be in its own PR.
1 parent f43e8c4 commit 36be9d0

File tree

8 files changed

+225
-26
lines changed

8 files changed

+225
-26
lines changed

sycl/source/detail/global_handler.cpp

Lines changed: 34 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ namespace detail {
3636
using LockGuard = std::lock_guard<SpinLock>;
3737
SpinLock GlobalHandler::MSyclGlobalHandlerProtector{};
3838

39+
// forward decl
40+
void shutdown_win(); // TODO: win variant will go away soon
41+
void shutdown_early();
42+
void shutdown_late();
43+
3944
// Utility class to track references on object.
4045
// Used for GlobalHandler now and created as thread_local object on the first
4146
// Scheduler usage. Origin idea is to track usage of Scheduler from main and
@@ -227,16 +232,25 @@ void GlobalHandler::releaseDefaultContexts() {
227232
MPlatformToDefaultContextCache.Inst.reset(nullptr);
228233
}
229234

230-
struct DefaultContextReleaseHandler {
231-
~DefaultContextReleaseHandler() {
235+
struct EarlyShutdownHandler {
236+
~EarlyShutdownHandler() {
237+
#ifdef _WIN32
238+
// on Windows we keep to the existing shutdown procedure
232239
GlobalHandler::instance().releaseDefaultContexts();
240+
#else
241+
shutdown_early();
242+
#endif
233243
}
234244
};
235245

236-
void GlobalHandler::registerDefaultContextReleaseHandler() {
237-
static DefaultContextReleaseHandler handler{};
246+
void GlobalHandler::registerEarlyShutdownHandler() {
247+
static EarlyShutdownHandler handler{};
238248
}
239249

250+
bool GlobalHandler::isOkToDefer() const { return OkToDefer; }
251+
252+
void GlobalHandler::endDeferredRelease() { OkToDefer = false; }
253+
240254
// Note: Split from shutdown so it is available to the unittests for ensuring
241255
// that the mock plugin is the lone plugin.
242256
void GlobalHandler::unloadPlugins() {
@@ -279,30 +293,37 @@ void GlobalHandler::drainThreadPool() {
279293
// itself is very aggressive about reclaiming memory. Thus,
280294
// we focus solely on unloading the plugins, so as to not
281295
// accidentally retain device handles. etc
282-
void shutdown() {
296+
void shutdown_win() {
283297
GlobalHandler *&Handler = GlobalHandler::getInstancePtr();
284298
Handler->unloadPlugins();
285299
}
286300
#else
287-
void shutdown() {
301+
void shutdown_early() {
288302
const LockGuard Lock{GlobalHandler::MSyclGlobalHandlerProtector};
289303
GlobalHandler *&Handler = GlobalHandler::getInstancePtr();
290304
if (!Handler)
291305
return;
292306

307+
// Now that we are shutting down, we will no longer defer MemObj releases.
308+
Handler->endDeferredRelease();
309+
293310
// Ensure neither host task is working so that no default context is accessed
294311
// upon its release
295312
Handler->prepareSchedulerToRelease(true);
296313

297314
if (Handler->MHostTaskThreadPool.Inst)
298315
Handler->MHostTaskThreadPool.Inst->finishAndWait();
299316

300-
// If default contexts are requested after the first default contexts have
301-
// been released there may be a new default context. These must be released
302-
// prior to closing the plugins.
303-
// Note: Releasing a default context here may cause failures in plugins with
304-
// global state as the global state may have been released.
317+
// This releases OUR reference to the default context, but
318+
// other may yet have refs
305319
Handler->releaseDefaultContexts();
320+
}
321+
322+
void shutdown_late() {
323+
const LockGuard Lock{GlobalHandler::MSyclGlobalHandlerProtector};
324+
GlobalHandler *&Handler = GlobalHandler::getInstancePtr();
325+
if (!Handler)
326+
return;
306327

307328
// First, release resources, that may access plugins.
308329
Handler->MPlatformCache.Inst.reset(nullptr);
@@ -345,7 +366,7 @@ extern "C" __SYCL_EXPORT BOOL WINAPI DllMain(HINSTANCE hinstDLL,
345366
// TODO: figure out what XPTI is doing that prevents release.
346367
#endif
347368

348-
shutdown();
369+
shutdown_win();
349370
break;
350371
case DLL_PROCESS_ATTACH:
351372
if (PrintPiTrace)
@@ -363,7 +384,7 @@ extern "C" __SYCL_EXPORT BOOL WINAPI DllMain(HINSTANCE hinstDLL,
363384
// destructors. Priorities 0-100 are reserved by the compiler. The priority
364385
// value 110 allows SYCL users to run their destructors after runtime library
365386
// deinitialization.
366-
__attribute__((destructor(110))) static void syclUnload() { shutdown(); }
387+
__attribute__((destructor(110))) static void syclUnload() { shutdown_late(); }
367388
#endif
368389
} // namespace detail
369390
} // namespace _V1

sycl/source/detail/global_handler.hpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,8 +73,10 @@ class GlobalHandler {
7373
XPTIRegistry &getXPTIRegistry();
7474
ThreadPool &getHostTaskThreadPool();
7575

76-
static void registerDefaultContextReleaseHandler();
76+
static void registerEarlyShutdownHandler();
7777

78+
bool isOkToDefer() const;
79+
void endDeferredRelease();
7880
void unloadPlugins();
7981
void releaseDefaultContexts();
8082
void drainThreadPool();
@@ -91,7 +93,11 @@ class GlobalHandler {
9193
void *GSYCLCallEvent = nullptr;
9294
#endif
9395

94-
friend void shutdown();
96+
bool OkToDefer = true;
97+
98+
friend void shutdown_win();
99+
friend void shutdown_early();
100+
friend void shutdown_late();
95101
friend class ObjectUsageCounter;
96102
static GlobalHandler *&getInstancePtr();
97103
static SpinLock MSyclGlobalHandlerProtector;

sycl/source/detail/platform_impl.cpp

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -193,15 +193,9 @@ std::vector<platform> platform_impl::get_platforms() {
193193
Platforms.push_back(Platform.first);
194194
}
195195

196-
// Register default context release handler after plugins have been loaded and
197-
// after the first calls to each plugin. This initializes a function-local
198-
// variable that should be destroyed before any global variables in the
199-
// plugins are destroyed. This is done after the first call to the backends to
200-
// ensure any lazy-loaded dependencies are loaded prior to the handler
201-
// variable's initialization. Note: The default context release handler is not
202-
// guaranteed to be destroyed before function-local static variables as they
203-
// may be initialized after.
204-
GlobalHandler::registerDefaultContextReleaseHandler();
196+
// This initializes a function-local variable whose destructor is invoked as
197+
// the SYCL shared library is first being unloaded.
198+
GlobalHandler::registerEarlyShutdownHandler();
205199

206200
return Platforms;
207201
}

sycl/source/detail/sycl_mem_obj_t.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -229,8 +229,11 @@ void SYCLMemObjT::detachMemoryObject(
229229
(MInteropContext && !MInteropContext->isOwnedByRuntime());
230230

231231
if (MRecord && MRecord->MCurContext->isOwnedByRuntime() &&
232-
!InteropObjectsUsed && (!MHostPtrProvided || MIsInternal))
233-
Scheduler::getInstance().deferMemObjRelease(Self);
232+
!InteropObjectsUsed && (!MHostPtrProvided || MIsInternal)) {
233+
bool okToDefer = GlobalHandler::instance().isOkToDefer();
234+
if (okToDefer)
235+
Scheduler::getInstance().deferMemObjRelease(Self);
236+
}
234237
}
235238

236239
void SYCLMemObjT::handleWriteAccessorCreation() {
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/*
2+
// compile to static lib
3+
clang++ -fsycl -c -fPIC -o simple_lib.o simple_lib.cpp
4+
5+
// compile to dynamic lib
6+
clang++ -fsycl -fPIC -shared -o simple_lib.so simple_lib.cpp
7+
8+
*/
9+
10+
#include "simple_lib.h"
11+
#include <sycl/detail/core.hpp>
12+
13+
const size_t BUFF_SIZE = 1;
14+
15+
class Delay {
16+
public:
17+
std::shared_ptr<sycl::buffer<int, 1>> sharedBuffer;
18+
19+
void release() {
20+
std::cout << "Delay.release()" << std::endl;
21+
sharedBuffer.reset();
22+
}
23+
24+
const sycl::buffer<int, 1> &getBuffer() {
25+
if (!sharedBuffer) {
26+
sharedBuffer = std::make_shared<sycl::buffer<int, 1>>(BUFF_SIZE);
27+
}
28+
return *sharedBuffer;
29+
}
30+
31+
Delay() : sharedBuffer(nullptr) {}
32+
~Delay() { release(); }
33+
};
34+
35+
#ifdef _WIN32
36+
static Delay theDelay;
37+
Delay *MyDelay = &theDelay;
38+
#else
39+
Delay *MyDelay = new Delay;
40+
41+
__attribute__((destructor(101))) static void Unload101() {
42+
std::cout << "lib unload - __attribute__((destructor(101)))" << std::endl;
43+
delete MyDelay;
44+
}
45+
#endif
46+
47+
EXPORTDECL int add_using_device(int a, int b) {
48+
sycl::queue q;
49+
sycl::buffer<int, 1> buf = MyDelay->getBuffer();
50+
q.submit([&](sycl::handler &cgh) {
51+
sycl::accessor acc(buf, cgh, sycl::write_only);
52+
53+
cgh.single_task([=] { acc[0] = a + b; });
54+
}).wait();
55+
56+
sycl::host_accessor acc(buf);
57+
return acc[0];
58+
}
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#ifndef SIMPLE_SYCL_LIB_H
2+
#define SIMPLE_SYCL_LIB_H
3+
4+
#ifdef _WIN32
5+
#define EXPORTDECL extern "C" __declspec(dllexport)
6+
#else
7+
#define EXPORTDECL extern "C"
8+
#endif
9+
10+
EXPORTDECL int add_using_device(int a, int b);
11+
12+
#endif
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
// REQUIRES: level_zero && linux
2+
3+
// build shared library
4+
// RUN: %clangxx -fsycl -fPIC -shared -o %T/simple_lib.so %S/Inputs/simple_lib.cpp
5+
6+
// build app
7+
// RUN: %clangxx -DSO_PATH="%T/simple_lib.so" -o %t.out %s
8+
9+
// RUN: %{run} %t.out
10+
// RUN: env UR_L0_LEAKS_DEBUG=1 %{run} %t.out
11+
12+
// In these tests we are building an intermediate library which uses SYCL and an
13+
// app that employs that intermediate library, using both static and dynamic
14+
// linking, and delayed release. This is to test that release and shutdown are
15+
// working correctly.
16+
17+
/*
18+
//library
19+
clang++ -fsycl -fPIC -shared -o simple_lib.so Inputs/simple_lib.cpp
20+
21+
//app
22+
clang++ -DSO_PATH="simple_lib.so" -o dynamic_app.bin dynamic_app_linux.cpp
23+
24+
UR_L0_LEAKS_DEBUG=1 ./dynamic_app.bin
25+
26+
*/
27+
28+
#include "Inputs/simple_lib.h"
29+
#include <assert.h>
30+
#include <dlfcn.h>
31+
#include <iostream>
32+
33+
void *handle = nullptr;
34+
35+
__attribute__((destructor(101))) static void Unload101() {
36+
std::cout << "app unload - __attribute__((destructor(101)))" << std::endl;
37+
if (handle) {
38+
dlclose(handle);
39+
handle = nullptr;
40+
}
41+
}
42+
43+
#define STRINGIFY_HELPER(A) #A
44+
#define STRINGIFY(A) STRINGIFY_HELPER(A)
45+
#define SO_FNAME "" STRINGIFY(SO_PATH) ""
46+
47+
int main() {
48+
49+
handle = dlopen(SO_FNAME, RTLD_NOW);
50+
if (!handle) {
51+
std::cout << "failed to load" << std::endl;
52+
return 1;
53+
}
54+
55+
// Function pointer to the exported function
56+
int (*add_using_device)(int, int) =
57+
(int (*)(int, int))dlsym(handle, "add_using_device");
58+
if (!add_using_device) {
59+
std::cout << "failed to get function" << std::endl;
60+
return 2;
61+
}
62+
63+
int result = add_using_device(3, 4);
64+
std::cout << "Result: " << result << std::endl;
65+
assert(result == 7);
66+
67+
return 0;
68+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
// REQUIRES: level_zero && linux
2+
3+
// DEFINE: %{fPIC_flag} = %if windows %{%} %else %{-fPIC%}
4+
// build static library
5+
// RUN: %clangxx -fsycl -c %{fPIC_flag} -o simple_lib.o %S/Inputs/simple_lib.cpp
6+
7+
// build app
8+
// RUN: %clangxx -fsycl -o %t.out %s simple_lib.o
9+
10+
// RUN: %{run} %t.out
11+
// RUN: env UR_L0_LEAKS_DEBUG=1 %{run} %t.out
12+
13+
// In these tests we are building an intermediate library which uses SYCL and an
14+
// app that employs that intermediate library, using both static and dynamic
15+
// linking, and delayed release. This is to test that release and shutdown are
16+
// working correctly.
17+
18+
/*
19+
//library
20+
clang++ -fsycl -c -fPIC -o simple_lib.o Inputs/simple_lib.cpp
21+
22+
//app
23+
clang++ -fsycl -o static_app.bin static_app.cpp simple_lib.o
24+
25+
UR_L0_LEAKS_DEBUG=1 ./simple_app.bin
26+
27+
*/
28+
29+
#include "Inputs/simple_lib.h"
30+
#include <assert.h>
31+
#include <iostream>
32+
33+
int main() {
34+
int result = add_using_device(3, 4);
35+
std::cout << "result: " << result << std::endl;
36+
assert(result == 7);
37+
}

0 commit comments

Comments
 (0)