10
10
#include < sys/syscall.h>
11
11
#include < unistd.h>
12
12
13
+ #include < atomic>
14
+ #include < chrono>
15
+ #include < condition_variable>
13
16
#include < cstdint>
14
17
#include < cstdio>
15
18
#include < cstdlib>
16
19
#include < iostream>
20
+ #include < mutex>
17
21
18
22
#ifdef C10_ANDROID
19
23
#ifndef SYS_gettid
@@ -109,8 +113,9 @@ FatalSignalHandler::FatalSignalHandler()
109
113
: fatalSignalHandlersInstalled(false ),
110
114
fatalSignalReceived(false ),
111
115
fatalSignalName(" <UNKNOWN>" ),
112
- writingCond(PTHREAD_COND_INITIALIZER),
113
- writingMutex(PTHREAD_MUTEX_INITIALIZER) {}
116
+ writingCond(),
117
+ writingMutex(),
118
+ signalReceived(false ) {}
114
119
115
120
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
116
121
FatalSignalHandler::signal_handler FatalSignalHandler::kSignalHandlers [] = {
@@ -157,8 +162,10 @@ void FatalSignalHandler::callPreviousSignalHandler(
157
162
158
163
// needsLock signals whether we need to lock our writing mutex.
159
164
void FatalSignalHandler::stacktraceSignalHandler (bool needsLock) {
165
+ std::unique_lock<std::mutex> ul (writingMutex, std::defer_lock);
160
166
if (needsLock) {
161
- pthread_mutex_lock (&writingMutex);
167
+ ul.lock ();
168
+ signalReceived = true ;
162
169
}
163
170
pid_t tid = static_cast <pid_t >(syscall (SYS_gettid));
164
171
std::string backtrace = fmt::format (
@@ -170,8 +177,8 @@ void FatalSignalHandler::stacktraceSignalHandler(bool needsLock) {
170
177
c10::get_backtrace());
171
178
std::cerr << backtrace << std::endl;
172
179
if (needsLock) {
173
- pthread_mutex_unlock (&writingMutex );
174
- pthread_cond_signal (& writingCond);
180
+ ul. unlock ( );
181
+ writingCond. notify_all ( );
175
182
}
176
183
}
177
184
@@ -204,23 +211,32 @@ void FatalSignalHandler::fatalSignalHandler(int signum) {
204
211
pid_t pid = getpid ();
205
212
pid_t currentTid = static_cast <pid_t >(syscall (SYS_gettid));
206
213
struct dirent * entry = nullptr ;
207
- pthread_mutex_lock (& writingMutex);
214
+ std::unique_lock<std::mutex> ul ( writingMutex);
208
215
while ((entry = readdir (procDir)) != nullptr ) {
209
216
if (entry->d_name [0 ] == ' .' ) {
210
217
continue ;
211
218
}
212
219
pid_t tid = atoi (entry->d_name );
213
220
// If we've found the current thread then we'll jump into the SIGUSR2
214
- // handler before calling pthread_cond_wait thus deadlocking, so branch
215
- // our directly to the backtrace handler instead of signaling it.
221
+ // handler directly instead of signaling ourselves, which would deadlock.
216
222
if (tid != currentTid) {
223
+ signalReceived = false ;
217
224
syscall (SYS_tgkill, pid, tid, SIGUSR2);
218
- pthread_cond_wait (&writingCond, &writingMutex);
225
+ auto now = std::chrono::system_clock::now ();
226
+ using namespace std ::chrono_literals;
227
+ // Use a timed wait (wait_until) rather than an unbounded wait: on ROCm
228
+ // a single thread was observed never to receive the SIGUSR2.
229
+ if (std::cv_status::timeout == writingCond.wait_until (ul, now + 2s)) {
230
+ if (!signalReceived) {
231
+ std::cerr << " signal lost waiting for stacktrace " << pid << " :"
232
+ << tid << std::endl;
233
+ break ;
234
+ }
235
+ }
219
236
} else {
220
237
stacktraceSignalHandler (false );
221
238
}
222
239
}
223
- pthread_mutex_unlock (&writingMutex);
224
240
} else {
225
241
perror (" Failed to open /proc/self/task" );
226
242
}
0 commit comments