Skip to content

Commit f838fa8

Browse files
jeffreytan81jeffreytan81
andauthored
New ThreadPlanSingleThreadTimeout to resolve potential deadlock in single thread stepping (#90930)
This PR introduces a new `ThreadPlanSingleThreadTimeout` that will be used to address potential deadlock during single-thread stepping. While debugging a target with a non-trivial number of threads (around 5000 threads in one example target), we noticed that a simple step over can take as long as 10 seconds. Enabling single-thread stepping mode significantly reduces the stepping time to around 3 seconds. However, this can introduce deadlock if we try to step over a method that depends on other threads to release a lock. To address this issue, we introduce a new `ThreadPlanSingleThreadTimeout` that can be controlled by the `target.process.thread.single-thread-plan-timeout` setting during single-thread stepping mode. The concept involves counting the elapsed time since the last internal stop to detect overall stepping progress. Once a timeout occurs, we assume the target is not making progress due to a potential deadlock, as mentioned above. We then send a new async interrupt, resume all threads, and `ThreadPlanSingleThreadTimeout` completes its task. To support this design, the major changes made in this PR are: 1. `ThreadPlanSingleThreadTimeout` is popped during every internal stop and reset (re-pushed) to the top of the stack (as a leaf node) during resume. This is achieved by always returning `true` from `ThreadPlanSingleThreadTimeout::DoPlanExplainsStop()` and `ThreadPlanSingleThreadTimeout::MischiefManaged()`. 2. A new thread-specific async interrupt stop is introduced, which can be detected/consumed by `ThreadPlanSingleThreadTimeout`. 3. The clearing of branch breakpoints in the range thread plan has been moved from `DoPlanExplainsStop()` to `ShouldStop()`, as it is not guaranteed that it will be called. The detailed design is discussed in the RFC below: [https://discourse.llvm.org/t/improve-single-thread-stepping/74599](https://discourse.llvm.org/t/improve-single-thread-stepping/74599) --------- Co-authored-by: jeffreytan81 <[email protected]>
1 parent 84cc186 commit f838fa8

30 files changed

+974
-53
lines changed

lldb/include/lldb/Target/Process.h

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1314,10 +1314,16 @@ class Process : public std::enable_shared_from_this<Process>,
13141314

13151315
size_t GetThreadStatus(Stream &ostrm, bool only_threads_with_stop_reason,
13161316
uint32_t start_frame, uint32_t num_frames,
1317-
uint32_t num_frames_with_source,
1318-
bool stop_format);
1317+
uint32_t num_frames_with_source, bool stop_format);
13191318

1320-
void SendAsyncInterrupt();
1319+
/// Send an async interrupt request.
1320+
///
1321+
/// If \a thread is specified the async interrupt stop will be attributed to
1322+
/// the specified thread.
1323+
///
1324+
/// \param[in] thread
1325+
/// The thread the async interrupt will be attributed to.
1326+
void SendAsyncInterrupt(Thread *thread = nullptr);
13211327

13221328
// Notify this process class that modules got loaded.
13231329
//
@@ -2867,6 +2873,17 @@ void PruneThreadPlans();
28672873
return std::nullopt;
28682874
}
28692875

2876+
/// Handle thread specific async interrupt and return the original thread
2877+
/// that requested the async interrupt. It can be null if original thread
2878+
/// has exited.
2879+
///
2880+
/// \param[in] description
2881+
/// Returns the stop reason description of the async interrupt.
2882+
virtual lldb::ThreadSP
2883+
HandleThreadAsyncInterrupt(uint8_t signo, const std::string &description) {
2884+
return lldb::ThreadSP();
2885+
}
2886+
28702887
lldb::StateType GetPrivateState();
28712888

28722889
/// The "private" side of resuming a process. This doesn't alter the state
@@ -3153,6 +3170,11 @@ void PruneThreadPlans();
31533170
// Resume will only request a resume, using this
31543171
// flag to check.
31553172

3173+
lldb::tid_t m_interrupt_tid; /// The tid of the thread that issued the async
3174+
/// interrupt, used by thread plan timeout. It
3175+
/// can be LLDB_INVALID_THREAD_ID to indicate
3176+
/// user level async interrupt.
3177+
31563178
/// This is set at the beginning of Process::Finalize() to stop functions
31573179
/// from looking up or creating things during or after a finalize call.
31583180
std::atomic<bool> m_finalizing;

lldb/include/lldb/Target/StopInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,10 @@ class StopInfo : public std::enable_shared_from_this<StopInfo> {
123123
const char *description = nullptr,
124124
std::optional<int> code = std::nullopt);
125125

126+
static lldb::StopInfoSP
127+
CreateStopReasonWithInterrupt(Thread &thread, int signo,
128+
const char *description);
129+
126130
static lldb::StopInfoSP CreateStopReasonToTrace(Thread &thread);
127131

128132
static lldb::StopInfoSP

lldb/include/lldb/Target/Thread.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@ class ThreadProperties : public Properties {
5858
bool GetStepOutAvoidsNoDebug() const;
5959

6060
uint64_t GetMaxBacktraceDepth() const;
61+
62+
uint64_t GetSingleThreadPlanTimeout() const;
6163
};
6264

6365
class Thread : public std::enable_shared_from_this<Thread>,

lldb/include/lldb/Target/ThreadPlan.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,8 @@ class ThreadPlan : public std::enable_shared_from_this<ThreadPlan>,
302302
eKindStepInRange,
303303
eKindRunToAddress,
304304
eKindStepThrough,
305-
eKindStepUntil
305+
eKindStepUntil,
306+
eKindSingleThreadTimeout,
306307
};
307308

308309
virtual ~ThreadPlan();
@@ -395,6 +396,11 @@ class ThreadPlan : public std::enable_shared_from_this<ThreadPlan>,
395396

396397
bool IsControllingPlan() { return m_is_controlling_plan; }
397398

399+
// Returns true if this plan is a leaf plan, meaning the plan will be popped
400+
// during each stop if it does not explain the stop and re-pushed before
401+
// resuming to stay at the top of the stack.
402+
virtual bool IsLeafPlan() { return false; }
403+
398404
bool SetIsControllingPlan(bool value) {
399405
bool old_value = m_is_controlling_plan;
400406
m_is_controlling_plan = value;
@@ -483,6 +489,8 @@ class ThreadPlan : public std::enable_shared_from_this<ThreadPlan>,
483489
return m_takes_iteration_count;
484490
}
485491

492+
virtual lldb::StateType GetPlanRunState() = 0;
493+
486494
protected:
487495
// Constructors and Destructors
488496
ThreadPlan(ThreadPlanKind kind, const char *name, Thread &thread,
@@ -522,8 +530,6 @@ class ThreadPlan : public std::enable_shared_from_this<ThreadPlan>,
522530
GetThread().SetStopInfo(stop_reason_sp);
523531
}
524532

525-
virtual lldb::StateType GetPlanRunState() = 0;
526-
527533
bool IsUsuallyUnexplainedStopReason(lldb::StopReason);
528534

529535
Status m_status;
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
//===-- ThreadPlanSingleThreadTimeout.h -------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLDB_TARGET_THREADPLANSINGLETHREADTIMEOUT_H
10+
#define LLDB_TARGET_THREADPLANSINGLETHREADTIMEOUT_H
11+
12+
#include "lldb/Target/Thread.h"
13+
#include "lldb/Target/ThreadPlan.h"
14+
#include "lldb/Utility/Event.h"
15+
#include "lldb/Utility/LLDBLog.h"
16+
#include "lldb/Utility/State.h"
17+
18+
#include <chrono>
19+
#include <thread>
20+
21+
namespace lldb_private {
22+
23+
//
24+
// Thread plan used by single thread execution to issue timeout. This is useful
25+
// to detect potential deadlock in single thread execution. The timeout measures
26+
// the elapsed time from the last internal stop and gets reset by each internal
27+
// stop to ensure we are accurately detecting execution not moving forward.
28+
// This means this thread plan may be created/destroyed multiple times by the
29+
// parent execution plan.
30+
//
31+
// When a timeout happens, the thread plan resolves the potential deadlock by
32+
// issuing a thread specific async interrupt to enter stop state, then execution
33+
// is resumed with all threads running to resolve the potential deadlock
34+
//
35+
class ThreadPlanSingleThreadTimeout : public ThreadPlan {
36+
enum class State {
37+
WaitTimeout, // Waiting for timeout.
38+
AsyncInterrupt, // Async interrupt has been issued.
39+
Done, // Finished resume all threads.
40+
};
41+
42+
public:
43+
// TODO: allow timeout to be set on per thread plan basis.
44+
struct TimeoutInfo {
45+
// Whether there is a ThreadPlanSingleThreadTimeout instance alive.
46+
bool m_isAlive = false;
47+
ThreadPlanSingleThreadTimeout::State m_last_state = State::WaitTimeout;
48+
};
49+
50+
~ThreadPlanSingleThreadTimeout() override;
51+
52+
// If input \param thread is running in single thread mode, push a
53+
// new ThreadPlanSingleThreadTimeout based on timeout setting from fresh new
54+
// state. The reference of \param info is passed in so that when
55+
// ThreadPlanSingleThreadTimeout got popped its last state can be stored
56+
// in it for future resume.
57+
static void PushNewWithTimeout(Thread &thread, TimeoutInfo &info);
58+
59+
// Push a new ThreadPlanSingleThreadTimeout by restoring state from
60+
// input \param info and resume execution.
61+
static void ResumeFromPrevState(Thread &thread, TimeoutInfo &info);
62+
63+
void GetDescription(Stream *s, lldb::DescriptionLevel level) override;
64+
bool ValidatePlan(Stream *error) override { return true; }
65+
bool WillStop() override;
66+
void DidPop() override;
67+
68+
bool IsLeafPlan() override { return true; }
69+
bool DoPlanExplainsStop(Event *event_ptr) override;
70+
71+
lldb::StateType GetPlanRunState() override;
72+
static void TimeoutThreadFunc(ThreadPlanSingleThreadTimeout *self);
73+
74+
bool MischiefManaged() override;
75+
76+
bool ShouldStop(Event *event_ptr) override;
77+
void SetStopOthers(bool new_value) override;
78+
bool StopOthers() override;
79+
80+
private:
81+
ThreadPlanSingleThreadTimeout(Thread &thread, TimeoutInfo &info);
82+
83+
bool IsTimeoutAsyncInterrupt(Event *event_ptr);
84+
bool HandleEvent(Event *event_ptr);
85+
void HandleTimeout();
86+
uint64_t GetRemainingTimeoutMilliSeconds();
87+
88+
static std::string StateToString(State state);
89+
90+
ThreadPlanSingleThreadTimeout(const ThreadPlanSingleThreadTimeout &) = delete;
91+
const ThreadPlanSingleThreadTimeout &
92+
operator=(const ThreadPlanSingleThreadTimeout &) = delete;
93+
94+
TimeoutInfo &m_info; // Reference to controlling ThreadPlan's TimeoutInfo.
95+
State m_state;
96+
97+
// Lock for m_wakeup_cv and m_exit_flag between thread plan thread and timer
98+
// thread
99+
std::mutex m_mutex;
100+
std::condition_variable m_wakeup_cv;
101+
std::thread m_timer_thread;
102+
std::chrono::steady_clock::time_point m_timeout_start;
103+
};
104+
105+
} // namespace lldb_private
106+
107+
#endif // LLDB_TARGET_THREADPLANSINGLETHREADTIMEOUT_H

lldb/include/lldb/Target/ThreadPlanStepOut.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ class ThreadPlanStepOut : public ThreadPlan, public ThreadPlanShouldStopHere {
3030
bool ValidatePlan(Stream *error) override;
3131
bool ShouldStop(Event *event_ptr) override;
3232
bool StopOthers() override;
33+
void SetStopOthers(bool new_value) override { m_stop_others = new_value; }
3334
lldb::StateType GetPlanRunState() override;
3435
bool WillStop() override;
3536
bool MischiefManaged() override;

lldb/include/lldb/Target/ThreadPlanStepOverRange.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,13 @@
1313
#include "lldb/Target/StackID.h"
1414
#include "lldb/Target/Thread.h"
1515
#include "lldb/Target/ThreadPlanStepRange.h"
16+
#include "lldb/Target/TimeoutResumeAll.h"
1617

1718
namespace lldb_private {
1819

1920
class ThreadPlanStepOverRange : public ThreadPlanStepRange,
20-
ThreadPlanShouldStopHere {
21+
ThreadPlanShouldStopHere,
22+
TimeoutResumeAll {
2123
public:
2224
ThreadPlanStepOverRange(Thread &thread, const AddressRange &range,
2325
const SymbolContext &addr_context,
@@ -27,7 +29,9 @@ class ThreadPlanStepOverRange : public ThreadPlanStepRange,
2729
~ThreadPlanStepOverRange() override;
2830

2931
void GetDescription(Stream *s, lldb::DescriptionLevel level) override;
32+
void SetStopOthers(bool new_value) override;
3033
bool ShouldStop(Event *event_ptr) override;
34+
void DidPush() override;
3135

3236
protected:
3337
bool DoPlanExplainsStop(Event *event_ptr) override;
@@ -44,6 +48,7 @@ class ThreadPlanStepOverRange : public ThreadPlanStepRange,
4448
bool IsEquivalentContext(const SymbolContext &context);
4549

4650
bool m_first_resume;
51+
lldb::RunMode m_run_mode;
4752

4853
ThreadPlanStepOverRange(const ThreadPlanStepOverRange &) = delete;
4954
const ThreadPlanStepOverRange &

lldb/include/lldb/Target/ThreadPlanStepRange.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,15 @@ class ThreadPlanStepRange : public ThreadPlan {
5858
// run' plan, then just single step.
5959
bool SetNextBranchBreakpoint();
6060

61+
// Whether the input stop info is caused by the next branch breakpoint.
62+
// Note: this does not check if branch breakpoint site is shared by other
63+
// breakpoints or not.
64+
bool IsNextBranchBreakpointStop(lldb::StopInfoSP stop_info_sp);
65+
6166
void ClearNextBranchBreakpoint();
6267

68+
void ClearNextBranchBreakpointExplainedStop();
69+
6370
bool NextRangeBreakpointExplainsStop(lldb::StopInfoSP stop_info_sp);
6471

6572
SymbolContext m_addr_context;
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
//===-- TimeoutResumeAll.h -------------------------*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLDB_TARGET_TIMEOUTRESUMEALL_H
10+
#define LLDB_TARGET_TIMEOUTRESUMEALL_H
11+
12+
#include "lldb/Target/ThreadPlanSingleThreadTimeout.h"
13+
14+
namespace lldb_private {
15+
16+
// Mixin class that provides the capability for ThreadPlan to support single
17+
// thread execution that resumes all threads after a timeout.
18+
// Opt-in thread plan should call PushNewTimeout() in its DidPush() and
19+
// ResumeWithTimeout() during DoWillResume().
20+
class TimeoutResumeAll {
21+
public:
22+
TimeoutResumeAll(Thread &thread) : m_thread(thread) {}
23+
24+
void PushNewTimeout() {
25+
ThreadPlanSingleThreadTimeout::PushNewWithTimeout(m_thread, m_timeout_info);
26+
}
27+
28+
void ResumeWithTimeout() {
29+
ThreadPlanSingleThreadTimeout::ResumeFromPrevState(m_thread,
30+
m_timeout_info);
31+
}
32+
33+
private:
34+
Thread &m_thread;
35+
ThreadPlanSingleThreadTimeout::TimeoutInfo m_timeout_info;
36+
};
37+
38+
} // namespace lldb_private
39+
40+
#endif // LLDB_TARGET_TIMEOUTRESUMEALL_H

lldb/include/lldb/lldb-enumerations.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,7 @@ enum StopReason {
253253
eStopReasonFork,
254254
eStopReasonVFork,
255255
eStopReasonVForkDone,
256+
eStopReasonInterrupt, ///< Thread requested interrupt
256257
};
257258

258259
/// Command Return Status Types.

lldb/source/API/SBThread.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,9 @@ size_t SBThread::GetStopReasonDataCount() {
192192
case eStopReasonSignal:
193193
return 1;
194194

195+
case eStopReasonInterrupt:
196+
return 1;
197+
195198
case eStopReasonException:
196199
return 1;
197200

@@ -261,6 +264,9 @@ uint64_t SBThread::GetStopReasonDataAtIndex(uint32_t idx) {
261264
case eStopReasonSignal:
262265
return stop_info_sp->GetValue();
263266

267+
case eStopReasonInterrupt:
268+
return stop_info_sp->GetValue();
269+
264270
case eStopReasonException:
265271
return stop_info_sp->GetValue();
266272

lldb/source/Interpreter/CommandInterpreter.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2513,7 +2513,7 @@ bool CommandInterpreter::DidProcessStopAbnormally() const {
25132513
const StopReason reason = stop_info->GetStopReason();
25142514
if (reason == eStopReasonException ||
25152515
reason == eStopReasonInstrumentation ||
2516-
reason == eStopReasonProcessorTrace)
2516+
reason == eStopReasonProcessorTrace || reason == eStopReasonInterrupt)
25172517
return true;
25182518

25192519
if (reason == eStopReasonSignal) {

lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerLLGS.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,8 @@ static const char *GetStopReasonString(StopReason stop_reason) {
714714
return "vfork";
715715
case eStopReasonVForkDone:
716716
return "vforkdone";
717+
case eStopReasonInterrupt:
718+
return "async interrupt";
717719
case eStopReasonInstrumentation:
718720
case eStopReasonInvalid:
719721
case eStopReasonPlanComplete:

0 commit comments

Comments
 (0)