Skip to content

Commit a9c776f

Browse files
dm-vodopyanovSergey Kanaev
andauthored
[SYCL] Fix deadlock in ProgramManager class (#2131)
This patch fixes deadlock in sycl::detail::ProgramManager class caused by data race in usage of KernelProgramCache::MBuildCV condition variable. Even if BuildResult->State.load() is atomic, it must be modified under the mutex. The deadlock happends in situation when notification notify_all() is sent while MBuildCV is in the wait expression BUT not in the waiting state. The result is - the notification is lost. After that the thread goes to the waiting state and sleeps forever. BuildResult is protected by mutex now, so the notification will be sent only when the other thread in the waiting state. Signed-off-by: Dmitry Vodopyanov <[email protected]> Co-authored-by: Sergey Kanaev <[email protected]>
1 parent 14ba1ec commit a9c776f

File tree

2 files changed

+31
-23
lines changed

2 files changed

+31
-23
lines changed

sycl/source/detail/kernel_program_cache.hpp

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -42,11 +42,25 @@ class KernelProgramCache {
4242
/// The pointer is not null if and only if the entity is usable.
4343
/// State of the entity is provided by the user of cache instance.
4444
/// Currently there is only a single user - ProgramManager class.
45-
template<typename T> struct BuildResult {
45+
template <typename T> struct BuildResult {
4646
std::atomic<T *> Ptr;
4747
std::atomic<int> State;
4848
BuildError Error;
4949

50+
/// Condition variable to signal that build result is ready.
51+
/// A per-object (i.e. kernel or program) condition variable is employed
52+
/// instead of global one in order to eliminate the following deadlock.
53+
/// A thread T1 awaiting for build result BR1 to be ready may be awakened by
54+
/// another thread (due to use of global condition variable), which made
55+
/// build result BR2 ready. Meanwhile, a thread which made build result BR1
56+
/// ready notifies everyone via a global condition variable and T1 will skip
57+
/// this notification as it's not in condition_variable::wait()'s wait cycle
58+
/// now. Now T1 goes to sleep again and will wait until either a spurious
59+
/// wake-up or another thread will wake it up.
60+
std::condition_variable MBuildCV;
61+
/// A mutex to be employed along with MBuildCV.
62+
std::mutex MBuildResultMutex;
63+
5064
BuildResult(T* P, int S) : Ptr{P}, State{S}, Error{"", 0} {}
5165
};
5266

@@ -59,14 +73,8 @@ class KernelProgramCache {
5973

6074
using PiKernelT = std::remove_pointer<RT::PiKernel>::type;
6175

62-
struct BuildResultKernel : public BuildResult<PiKernelT> {
63-
std::mutex MKernelMutex;
64-
65-
BuildResultKernel(PiKernelT *P, int S) : BuildResult(P, S) {}
66-
};
67-
6876
using PiKernelPtrT = std::atomic<PiKernelT *>;
69-
using KernelWithBuildStateT = BuildResultKernel;
77+
using KernelWithBuildStateT = BuildResult<PiKernelT>;
7078
using KernelByNameT = std::map<string_class, KernelWithBuildStateT>;
7179
using KernelCacheT = std::map<RT::PiProgram, KernelByNameT>;
7280

@@ -82,21 +90,21 @@ class KernelProgramCache {
8290
return {MKernelsPerProgramCache, MKernelsPerProgramCacheMutex};
8391
}
8492

85-
template <class Predicate> void waitUntilBuilt(Predicate Pred) const {
86-
std::unique_lock<std::mutex> Lock(MBuildCVMutex);
93+
template <typename T, class Predicate>
94+
void waitUntilBuilt(BuildResult<T> &BR, Predicate Pred) const {
95+
std::unique_lock<std::mutex> Lock(BR.MBuildResultMutex);
8796

88-
MBuildCV.wait(Lock, Pred);
97+
BR.MBuildCV.wait(Lock, Pred);
8998
}
9099

91-
void notifyAllBuild() const { MBuildCV.notify_all(); }
100+
template <typename T> void notifyAllBuild(BuildResult<T> &BR) const {
101+
BR.MBuildCV.notify_all();
102+
}
92103

93104
private:
94105
std::mutex MProgramCacheMutex;
95106
std::mutex MKernelsPerProgramCacheMutex;
96107

97-
mutable std::condition_variable MBuildCV;
98-
mutable std::mutex MBuildCVMutex;
99-
100108
ProgramCacheT MCachedPrograms;
101109
KernelCacheT MKernelsPerProgramCache;
102110
ContextPtr MParentContext;

sycl/source/detail/program_manager/program_manager.cpp

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ RetT *waitUntilBuilt(KernelProgramCache &Cache,
125125
KernelProgramCache::BuildResult<RetT> *BuildResult) {
126126
// any thread which will find nullptr in cache will wait until the pointer
127127
// is not null anymore
128-
Cache.waitUntilBuilt([BuildResult]() {
128+
Cache.waitUntilBuilt(*BuildResult, [BuildResult]() {
129129
int State = BuildResult->State.load();
130130

131131
return State == BS_Done || State == BS_Failed;
@@ -212,7 +212,7 @@ getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey, AcquireFT &&Acquire,
212212

213213
BuildResult->State.store(BS_Done);
214214

215-
KPCache.notifyAllBuild();
215+
KPCache.notifyAllBuild(*BuildResult);
216216

217217
return BuildResult;
218218
} catch (const exception &Ex) {
@@ -221,13 +221,13 @@ getOrBuild(KernelProgramCache &KPCache, KeyT &&CacheKey, AcquireFT &&Acquire,
221221

222222
BuildResult->State.store(BS_Failed);
223223

224-
KPCache.notifyAllBuild();
224+
KPCache.notifyAllBuild(*BuildResult);
225225

226226
std::rethrow_exception(std::current_exception());
227227
} catch (...) {
228228
BuildResult->State.store(BS_Failed);
229229

230-
KPCache.notifyAllBuild();
230+
KPCache.notifyAllBuild(*BuildResult);
231231

232232
std::rethrow_exception(std::current_exception());
233233
}
@@ -445,10 +445,10 @@ ProgramManager::getOrCreateKernel(OSModuleHandle M, const context &Context,
445445
return Result;
446446
};
447447

448-
auto BuildResult = static_cast<KernelProgramCache::BuildResultKernel *>(
449-
getOrBuild<PiKernelT, invalid_object_error>(Cache, KernelName, AcquireF,
450-
GetF, BuildF));
451-
return std::make_pair(BuildResult->Ptr.load(), &(BuildResult->MKernelMutex));
448+
auto BuildResult = getOrBuild<PiKernelT, invalid_object_error>(
449+
Cache, KernelName, AcquireF, GetF, BuildF);
450+
return std::make_pair(BuildResult->Ptr.load(),
451+
&(BuildResult->MBuildResultMutex));
452452
}
453453

454454
RT::PiProgram

0 commit comments

Comments
 (0)