Skip to content

mbed_error.c: Better HW fault exceptions and stack dump #11332

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Sep 6, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
9af7ea5
The candidate code to enable correct crash report for HW fault crashes.
andrewc-arm Aug 26, 2019
d69dc49
Fixed astyle failures on handle_error().
andrewc-arm Aug 26, 2019
57ed4f3
Removed #warning of non-Cortex-M. Minor comment adjustment.
andrewc-arm Aug 27, 2019
50daa7e
platform.stack-dump-enabled feature is added.
andrewc-arm Jun 26, 2019
2487461
Now stack dump is done correctly to the other way around.
andrewc-arm Jun 28, 2019
575bd31
print_error_report: Don't care about osRtxStackFillPattern and dump w…
andrewc-arm Aug 27, 2019
835a763
print_error_report: Paying extra caution on address alignment.
andrewc-arm Aug 27, 2019
5671416
handle_error: Now supports HW exception from handler mode.
andrewc-arm Sep 3, 2019
db1e2d3
astyle to mbed_error.c
andrewc-arm Sep 3, 2019
d83bbd9
Reverted back the astylerc.
andrewc-arm Sep 3, 2019
7dc5317
mbed_error.c: Now we can stack dump on all possible threads.
andrewc-arm Sep 3, 2019
5258768
The stack dump of the major should be moved up.
andrewc-arm Sep 3, 2019
95d2eb8
platform: stack-dump-enabled default value to false.
andrewc-arm Sep 4, 2019
da2f8e5
mbed_error.c: reducing RTX restriction.
andrewc-arm Sep 4, 2019
0da589e
mbed_error.c: handler mode failures will dump MSP/PSP stacks.
andrewc-arm Sep 4, 2019
76353d5
mbed_error.c: now dumping MSP/PSP stacks correctly.
andrewc-arm Sep 4, 2019
a769b7d
mbed_error.c: Fixed compile errors with non-RTX targets.
andrewc-arm Sep 5, 2019
bd9ec8b
mbed_error.c: fixed the dump core function's bug of possible stack ov…
andrewc-arm Sep 6, 2019
c257c5f
mbed_error.c: Fixed another bug of possible stack overflow.
andrewc-arm Sep 6, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions platform/mbed_lib.json
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,12 @@
"value": null
},

"stack-dump-enabled": {
"macro_name": "MBED_STACK_DUMP_ENABLED",
"help": "Set to true to enable stack dump.",
"value": false
},

"cpu-stats-enabled": {
"macro_name": "MBED_CPU_STATS_ENABLED",
"help": "Set to 1 to enable cpu stats. When enabled the function mbed_stats_cpu_get returns non-zero data. See mbed_stats.h for more information",
Expand Down
2 changes: 1 addition & 1 deletion platform/source/TARGET_CORTEX_M/mbed_fault_handler.c
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ void mbed_fault_handler(uint32_t fault_type, const mbed_fault_context_t *mbed_fa
mbed_error_printf("\n\n-- MbedOS Fault Handler --\n\n");

//Now call mbed_error, to log the error and halt the system
mbed_error(faultStatus, "Fault exception", mbed_fault_context->PC_reg, NULL, 0);
mbed_error(faultStatus, "Fault exception", (unsigned int)mbed_fault_context_in, NULL, 0);

}

Expand Down
126 changes: 118 additions & 8 deletions platform/source/mbed_error.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
#include "platform/mbed_interface.h"
#include "platform/mbed_power_mgmt.h"
#include "platform/mbed_stats.h"
#include "platform/source/TARGET_CORTEX_M/mbed_fault_handler.h"
#include "mbed_rtx.h"
#ifdef MBED_CONF_RTOS_PRESENT
#include "rtx_os.h"
#endif
Expand All @@ -38,6 +40,10 @@
#endif
#include <inttypes.h>

#ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif

#ifndef NDEBUG
#define ERROR_REPORT(ctx, error_msg, error_filename, error_line) print_error_report(ctx, error_msg, error_filename, error_line)
static void print_error_report(const mbed_error_ctx *ctx, const char *, const char *error_filename, int error_line);
Expand Down Expand Up @@ -132,6 +138,26 @@ WEAK MBED_NORETURN void error(const char *format, ...)
mbed_halt_system();
}

static inline bool mbed_error_is_hw_fault(mbed_error_status_t error_status)
{
return (error_status == MBED_ERROR_MEMMANAGE_EXCEPTION ||
error_status == MBED_ERROR_BUSFAULT_EXCEPTION ||
error_status == MBED_ERROR_USAGEFAULT_EXCEPTION ||
error_status == MBED_ERROR_HARDFAULT_EXCEPTION);
}

static bool mbed_error_is_handler(const mbed_error_ctx *ctx)
{
bool is_handler = false;
if (ctx && mbed_error_is_hw_fault(ctx->error_status)) {
mbed_fault_context_t *mfc = (mbed_fault_context_t *)ctx->error_value;
if (mfc && !(mfc->EXC_RETURN & 0x8)) {
is_handler = true;
}
}
return is_handler;
}

//Set an error status with the error handling system
static mbed_error_status_t handle_error(mbed_error_status_t error_status, unsigned int error_value, const char *filename, int line_number, void *caller)
{
Expand All @@ -146,18 +172,29 @@ static mbed_error_status_t handle_error(mbed_error_status_t error_status, unsign

//Clear the context capturing buffer
memset(&current_error_ctx, 0, sizeof(mbed_error_ctx));

//Capture error information
current_error_ctx.error_status = error_status;
current_error_ctx.error_address = (uint32_t)caller;
current_error_ctx.error_value = error_value;
mbed_fault_context_t *mfc = NULL;
if (mbed_error_is_hw_fault(error_status)) {
mfc = (mbed_fault_context_t *)error_value;
current_error_ctx.error_address = (uint32_t)mfc->PC_reg;
// Note that this SP_reg is the correct SP value of the fault. PSP and MSP are slightly different due to HardFault_Handler.
current_error_ctx.thread_current_sp = (uint32_t)mfc->SP_reg;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

"Thread current SP" would actually be PSP.

SP_reg will be either thread SP (PSP) if the exception happened in thread mode, or main SP (MSP) if the exception happened in handler mode. Second case is less likely but possible.

I think you want PSP specifically here, as you are going to show it as part of the "current thread" line, next to the pointer for the thread base, as if intending to compare them.

(Although it doesn't matter that much, as the fault handler should have printed all of SP, MSP and PSP anyway before entering us, right?)

Not sure the warning is the way to go - we have no portable non-M fault handler, so how could any using a Cortex-A "implement a non Cortex-M handler" in this file to silence it? I would just let it use the existing code, and skip the warning.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kjbracey-arm

I think you want PSP specifically here,

Please note that there is an adjustment of PSP/MSP on this assembly code. It looks like the assembly code is adjusting the PSP/MSP value offset caused by calling Fault_Handler. I experimented with gdb and made sure that SP_reg value is the exact same with SP register value just before the crash. If you observe teraterm_20190826.1656.log, you can see that SP and PSP are different.

[2019-08-26 16:57:01.354] SP   : 20002D30 <== correct value just before the crash
[2019-08-26 16:57:01.371] LR   : 000017DF
[2019-08-26 16:57:01.389] PC   : 000017C4
[2019-08-26 16:57:01.403] xPSR : 21000000
[2019-08-26 16:57:01.421] PSP  : 20002CC8 <== a bit tainted value due to Fault_Handler
[2019-08-26 16:57:01.438] MSP  : 2002FFC0

(Although it doesn't matter that much, as the fault handler should have printed all of SP, MSP and PSP anyway before entering us, right?)

Yes. We print all the context as shown above. And we print this mode information as well.

[2019-08-26 16:57:01.571] Mode : Thread
[2019-08-26 16:57:01.586] Priv : Privileged
[2019-08-26 16:57:01.604] Stack: PSP

I would just let it use the existing code, and skip the warning.

OK. I removed the warning in bdfef3f.

Thanks!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Okay, that complicates a bit. So SP is a better value for "thread SP" than PSP if the exception was from thread mode. But if the exception was from handler mode, then SP is a better value for "handler SP" than MSP.

Then if you're looking for "thread stack pointer" specifically, then you'd want to take SP if from exception was from thread mode, or PSP if from handler mode. So you need to look at the SPSEL bit of the exception return value.

(I've been around all this stuff working on exception handling for pyOCD, which has to do the same sort of reconstruction logic, trying to cover all cases).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kjbracey-arm

Okay, that complicates a bit. So SP is a better value for "thread SP" than PSP if the exception was from thread mode. But if the exception was from handler mode, then SP is a better value for "handler SP" than MSP.

Yes. I believe SP is the better value in both cases. If that's what you meant.

In case of thread mode, PSP is provided as a source to SP via except.S#L112. In case of handler mode, MSP is provided as source to SP via except.S#L107. Our assembly already checks EXC_RETURN from except.S#L110.

For stack dump feature upcoming, I will check the mode and dump the right PSP/MSP. For this PR, I believe providing SP as it is a good choice.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In case of handler mode, MSP is provided as source to SP via except.S#L107.

So in that case SP is not a thread stack pointer at all. If you're not able to do adaptive behaviour here, then a fixed choice of SP is better to get precision for the common case of exception in thread mode, so it's the 1 to go for, even though it's worse for the rare case of exception in handler mode.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now I understand your point. You mean this printout

Current Thread: main  Id: 0x20000E50 Entry: 0x3D13 StackSize: 0x1000 StackMem: 0x20001E60 SP: 0x20002D30

should also be changed to correctly show the handler stack instead of the thread stack. I will test with handler mode crashes and see if I can improve today.

@0xc0170 , please hold off merging. 😊

Copy link
Contributor

@kjbracey kjbracey Sep 2, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The current thread information is still useful in that case.

With the thread dumping info on (I'm not sure why it isn't by default), it would be the missing (and most important piece) of the full thread dump. The thread information extracted from the OS is only up-to-date for the non-active threads - we have to get the current thread's from the registers.

But if the crash is actually in handler mode, it could either be due to a random interrupt thing, totally unconnected to the current thread, or it could be due to something the current thread did (maybe enabling interrupts at a bad time, or some bug in a supervisor call to RTOS).

So it would make sense to show both current thread and handler mode info when crashing in handler mode. Either or both could be important.

Again - look at pyOCD, and this is basically how it displays info with its thread awareness.

It shows:

  • Current backtrace for current stack (by examining SP).
  • If in handler mode (main stack), also show thread and backtrace for process stack (by examining MSPPSP).
  • If RTOS awareness, also show thread and backtrace for other non-current RTOS threads (by examining RTOS global state structures)

So the handler mode stack appears whenever stopped in handler mode, else you don't see it and only see threads.

This then ties in with pyocd/pyOCD#430, which I really should get merged, which makes it show the aborting thread by default when stopping in the fault handler - at present, it shows you as having "SIGSEGV at fault handler", rather than the more logical "SIGSEGV at main thread location". The first is technically true at a hardware level, because it has entered the fault handler by the time the debugger gets there, but it's not the way that signals are supposed to be reported in GDB. (It's not as if your thing here is going to say hard fault at fault handler, is it?)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kjbracey-arm
Please review the original description of this request.

  • Stack dump is enabled via "platform.stack-dump-enabled": true mbed_app.json configuration.
  • If stack dump is used in conjunction with "platform.error-all-threads-info": true, all the threads' stack will be dumped to provide holistic overview of all the threads.

Now with above two configs set to true, Mbed-OS will print the whole stack including the handler mode's stack.

Also, note how handler mode's printout is cleaned up like this.

Current Thread: <handler>  Id: 0x0 Entry: 0x0 StackSize: 0xE4 StackMem: 0x2002FF18 SP: 0x2002FF18 

There is no thread id in handler and there is no entry. Or if you know, please let me know.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you want a handler pseudo-thread (and that's a reasonable way to present it), it should be in addition to the current OS thread, rather than replacing it (like pyOCD). If I read the current code correctly, if I pass a bad pointer to osMutexLock, then we could enter the SVC handler, then hard fault inside it, and we wouldn't see the current thread that did that. We'd see "handler" and all other threads, and no backtrace showing the mutex lock at all.

(You should be able to test literally that scenario - do something like osMutexLock((osMutexId_t) 0xDEADDEAD)).

In the handler crash scenario, we don't know whether it's the handler's fault (eg bad interrupt handler) or the current thread's fault, like that mutex lock. So we should try to get both backtraces.

// Note that the RTX thread itself is the same even under this fault exception handler.
} else {
current_error_ctx.error_address = (uint32_t)caller;
current_error_ctx.thread_current_sp = (uint32_t)&current_error_ctx; // Address local variable to get a stack pointer
}

#ifdef MBED_CONF_RTOS_PRESENT
//Capture thread info
// Capture thread info in thread mode
osRtxThread_t *current_thread = osRtxInfo.thread.run.curr;
current_error_ctx.thread_id = (uint32_t)current_thread;
current_error_ctx.thread_entry_address = (uint32_t)current_thread->thread_addr;
current_error_ctx.thread_stack_size = current_thread->stack_size;
current_error_ctx.thread_stack_mem = (uint32_t)current_thread->stack_mem;
current_error_ctx.thread_current_sp = (uint32_t)&current_error_ctx; // Address local variable to get a stack pointer
#endif //MBED_CONF_RTOS_PRESENT

#if MBED_CONF_PLATFORM_ERROR_FILENAME_CAPTURE_ENABLED
Expand Down Expand Up @@ -440,16 +477,83 @@ mbed_error_status_t mbed_clear_all_errors(void)
return status;
}

static inline const char *name_or_unnamed(const char *name)
#ifdef MBED_CONF_RTOS_PRESENT
static inline const char *name_or_unnamed(const osRtxThread_t *thread)
{
const char *unnamed = "<unnamed>";

if (!thread) {
return unnamed;
}

const char *name = thread->name;
return name ? name : unnamed;
}
#endif // MBED_CONF_RTOS_PRESENT

#if MBED_STACK_DUMP_ENABLED
/** Prints stack dump from given stack information.
* The arguments should be given in address raw value to check alignment.
* @param stack_start The address of stack start.
* @param stack_size The size of stack
* @param stack_sp The stack pointer currently at. */
static void print_stack_dump_core(uint32_t stack_start, uint32_t stack_size, uint32_t stack_sp, const char *postfix)
{
#if MBED_STACK_DUMP_ENABLED
#define STACK_DUMP_WIDTH 8
#define INT_ALIGN_MASK (~(sizeof(int) - 1))
mbed_error_printf("\nStack Dump: %s", postfix);
uint32_t st_end = (stack_start + stack_size) & INT_ALIGN_MASK;
uint32_t st = (stack_sp) & INT_ALIGN_MASK;
for (; st < st_end; st += sizeof(int) * STACK_DUMP_WIDTH) {
mbed_error_printf("\n0x%08" PRIX32 ":", st);
for (int i = 0; i < STACK_DUMP_WIDTH; i++) {
uint32_t st_cur = st + i * sizeof(int);
if (st_cur >= st_end) {
break;
}
uint32_t st_val = *((uint32_t *)st_cur);
mbed_error_printf("0x%08" PRIX32 " ", st_val);
}
}
mbed_error_printf("\n");
#endif // MBED_STACK_DUMP_ENABLED
}

static void print_stack_dump(uint32_t stack_start, uint32_t stack_size, uint32_t stack_sp, const mbed_error_ctx *ctx)
{
return name ? name : "<unnamed>";
if (ctx && mbed_error_is_handler(ctx)) {
// Stack dump extra for handler stack which may have accessed MSP.
mbed_fault_context_t *mfc = (mbed_fault_context_t *)ctx->error_value;
uint32_t msp_sp = mfc->MSP;
uint32_t psp_sp = mfc->PSP;
if (mfc && !(mfc->EXC_RETURN & 0x4)) {
// MSP mode. Then SP_reg is more correct.
msp_sp = mfc->SP_reg;
} else {
// PSP mode. Then SP_reg is more correct.
psp_sp = mfc->SP_reg;
}
uint32_t msp_size = MAX(0, (int)INITIAL_SP - (int)msp_sp);
print_stack_dump_core(msp_sp, msp_size, msp_sp, "MSP");

stack_sp = psp_sp;
}

print_stack_dump_core(stack_start, stack_size, stack_sp, "PSP");
}
#endif // MBED_STACK_DUMP_ENABLED

#if MBED_CONF_PLATFORM_ERROR_ALL_THREADS_INFO && defined(MBED_CONF_RTOS_PRESENT)
/* Prints info of a thread(using osRtxThread_t struct)*/
static void print_thread(const osRtxThread_t *thread)
{
mbed_error_printf("\n%s State: 0x%" PRIX8 " Entry: 0x%08" PRIX32 " Stack Size: 0x%08" PRIX32 " Mem: 0x%08" PRIX32 " SP: 0x%08" PRIX32, name_or_unnamed(thread->name), thread->state, thread->thread_addr, thread->stack_size, (uint32_t)thread->stack_mem, thread->sp);
uint32_t stack_mem = (uint32_t)thread->stack_mem;
mbed_error_printf("\n%s State: 0x%" PRIX8 " Entry: 0x%08" PRIX32 " Stack Size: 0x%08" PRIX32 " Mem: 0x%08" PRIX32 " SP: 0x%08" PRIX32, name_or_unnamed(thread), thread->state, thread->thread_addr, thread->stack_size, stack_mem, thread->sp);

#if MBED_STACK_DUMP_ENABLED
print_stack_dump(stack_mem, thread->stack_size, thread->sp, NULL);
#endif
}

/* Prints thread info from a list */
Expand Down Expand Up @@ -532,11 +636,16 @@ static void print_error_report(const mbed_error_ctx *ctx, const char *error_msg,

mbed_error_printf("\nError Value: 0x%" PRIX32, ctx->error_value);
#ifdef MBED_CONF_RTOS_PRESENT
mbed_error_printf("\nCurrent Thread: %s Id: 0x%" PRIX32 " Entry: 0x%" PRIX32 " StackSize: 0x%" PRIX32 " StackMem: 0x%" PRIX32 " SP: 0x%" PRIX32 " ",
name_or_unnamed(((osRtxThread_t *)ctx->thread_id)->name),
bool is_handler = mbed_error_is_handler(ctx);
mbed_error_printf("\nCurrent Thread: %s%s Id: 0x%" PRIX32 " Entry: 0x%" PRIX32 " StackSize: 0x%" PRIX32 " StackMem: 0x%" PRIX32 " SP: 0x%" PRIX32 " ",
name_or_unnamed((osRtxThread_t *)ctx->thread_id), is_handler ? " <handler>" : "",
ctx->thread_id, ctx->thread_entry_address, ctx->thread_stack_size, ctx->thread_stack_mem, ctx->thread_current_sp);
#endif

#if MBED_STACK_DUMP_ENABLED
print_stack_dump(ctx->thread_stack_mem, ctx->thread_stack_size, ctx->thread_current_sp, ctx);
#endif

#if MBED_CONF_PLATFORM_ERROR_ALL_THREADS_INFO && defined(MBED_CONF_RTOS_PRESENT)
mbed_error_printf("\nNext:");
print_thread(osRtxInfo.thread.run.next);
Expand All @@ -557,6 +666,7 @@ static void print_error_report(const mbed_error_ctx *ctx, const char *error_msg,
mbed_stats_sys_get(&sys_stats);
mbed_error_printf("\nFor more info, visit: https://mbed.com/s/error?error=0x%08X&osver=%" PRId32 "&core=0x%08" PRIX32 "&comp=%d&ver=%" PRIu32 "&tgt=" GET_TARGET_NAME(TARGET_NAME), ctx->error_status, sys_stats.os_version, sys_stats.cpu_id, sys_stats.compiler_id, sys_stats.compiler_version);
#endif

mbed_error_printf("\n-- MbedOS Error Info --\n");
}
#endif //ifndef NDEBUG
Expand Down