Skip to content

Commit 0262ea1

Browse files
committed
runtime: print a stack trace at "morestack on g0"
An error like "morestack on g0" is one of the errors that are very hard to debug, because it often doesn't print a useful stack trace. The runtime doesn't print a stack trace directly because the stack is in too bad a state to call print. Sometimes the SIGABRT triggers a traceback, but sometimes it does not, especially in a cgo binary. Even if it triggers a traceback, it often does not include the stack trace of the bad stack. This CL makes the runtime explicitly print a stack trace and throw. The idea is to have some space as an "emergency" crash stack. When the stack is in a really bad state, we switch to the crash stack and do a traceback. Currently only implemented on AMD64 and ARM64. TODO: also handle errors like "morestack on gsignal" and bad systemstack. Also handle other architectures. Change-Id: Ibfc397202f2bb0737c5cbe99f2763de83301c1c1 Reviewed-on: https://go-review.googlesource.com/c/go/+/419435 LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Michael Pratt <[email protected]>
1 parent 29b8039 commit 0262ea1

File tree

6 files changed

+147
-27
lines changed

6 files changed

+147
-27
lines changed

src/runtime/asm.s

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,11 @@ TEXT ·sigpanic0(SB),NOSPLIT,$0-0
1212
// See map.go comment on the need for this routine.
1313
TEXT ·mapinitnoop<ABIInternal>(SB),NOSPLIT,$0-0
1414
RET
15+
16+
#ifndef GOARCH_amd64
17+
#ifndef GOARCH_arm64
18+
// Stub to appease shared build mode. It is only assembled when GOARCH is
// neither amd64 nor arm64 (see the two #ifndef guards above), and its body
// is a single UNDEF.
19+
TEXT ·switchToCrashStack0<ABIInternal>(SB),NOSPLIT,$0-0
20+
UNDEF
21+
#endif
22+
#endif

src/runtime/asm_amd64.s

Lines changed: 39 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,30 @@ bad:
537537
CALL AX
538538
INT $3
539539

540+
// func switchToCrashStack0(fn func())
//
// Makes gcrash the current g (and the current m's g0), moves SP onto
// gcrash's stack, and calls fn. fn is not expected to return; if it does,
// the routine aborts.
541+
TEXT runtime·switchToCrashStack0<ABIInternal>(SB), NOSPLIT, $0-8
542+
MOVQ g_m(R14), BX // curm (R14 holds the current g on entry)
543+
544+
// set g to gcrash
545+
LEAQ runtime·gcrash(SB), R14 // g = &gcrash
546+
MOVQ BX, g_m(R14) // g.m = curm
547+
MOVQ R14, m_g0(BX) // curm.g0 = g
548+
get_tls(CX)
549+
MOVQ R14, g(CX) // also store the new g in the TLS g slot
550+
551+
// switch to crashstack
552+
MOVQ (g_stack+stack_hi)(R14), BX // BX = gcrash.stack.hi
553+
SUBQ $(4*8), BX // start 4 words (32 bytes) below the top of the stack
554+
MOVQ BX, SP
555+
556+
// call target function
557+
MOVQ AX, DX // DX = fn (the func value), kept for the callee
558+
MOVQ 0(AX), AX // AX = code pointer stored at the start of the func value
559+
CALL AX
560+
561+
// should never return
562+
CALL runtime·abort(SB)
563+
UNDEF
540564

541565
/*
542566
* support for morestack
@@ -551,17 +575,26 @@ bad:
551575
TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
552576
// Cannot grow scheduler stack (m->g0).
553577
get_tls(CX)
554-
MOVQ g(CX), BX
555-
MOVQ g_m(BX), BX
556-
MOVQ m_g0(BX), SI
557-
CMPQ g(CX), SI
578+
MOVQ g(CX), DI // DI = g
579+
MOVQ g_m(DI), BX // BX = m
580+
581+
// Set g->sched to context in f.
582+
MOVQ 0(SP), AX // f's PC
583+
MOVQ AX, (g_sched+gobuf_pc)(DI)
584+
LEAQ 8(SP), AX // f's SP
585+
MOVQ AX, (g_sched+gobuf_sp)(DI)
586+
MOVQ BP, (g_sched+gobuf_bp)(DI)
587+
MOVQ DX, (g_sched+gobuf_ctxt)(DI)
588+
589+
MOVQ m_g0(BX), SI // SI = m.g0
590+
CMPQ DI, SI
558591
JNE 3(PC)
559592
CALL runtime·badmorestackg0(SB)
560593
CALL runtime·abort(SB)
561594

562595
// Cannot grow signal stack (m->gsignal).
563596
MOVQ m_gsignal(BX), SI
564-
CMPQ g(CX), SI
597+
CMPQ DI, SI
565598
JNE 3(PC)
566599
CALL runtime·badmorestackgsignal(SB)
567600
CALL runtime·abort(SB)
@@ -573,17 +606,7 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
573606
MOVQ AX, (m_morebuf+gobuf_pc)(BX)
574607
LEAQ 16(SP), AX // f's caller's SP
575608
MOVQ AX, (m_morebuf+gobuf_sp)(BX)
576-
get_tls(CX)
577-
MOVQ g(CX), SI
578-
MOVQ SI, (m_morebuf+gobuf_g)(BX)
579-
580-
// Set g->sched to context in f.
581-
MOVQ 0(SP), AX // f's PC
582-
MOVQ AX, (g_sched+gobuf_pc)(SI)
583-
LEAQ 8(SP), AX // f's SP
584-
MOVQ AX, (g_sched+gobuf_sp)(SI)
585-
MOVQ BP, (g_sched+gobuf_bp)(SI)
586-
MOVQ DX, (g_sched+gobuf_ctxt)(SI)
609+
MOVQ DI, (m_morebuf+gobuf_g)(BX)
587610

588611
// Call newstack on m->g0's stack.
589612
MOVQ m_g0(BX), BX

src/runtime/asm_arm64.s

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -262,6 +262,30 @@ noswitch:
262262
SUB $8, RSP, R29 // restore FP
263263
B (R3)
264264

265+
// func switchToCrashStack0(fn func())
//
// Makes gcrash the current g (and the current m's g0), moves RSP onto
// gcrash's stack, and calls fn. fn is not expected to return; if it does,
// the routine aborts.
266+
TEXT runtime·switchToCrashStack0<ABIInternal>(SB), NOSPLIT, $0-8
267+
MOVD R0, R26 // context register; fn is copied out of R0 before save_g clobbers it
268+
MOVD g_m(g), R1 // curm
269+
270+
// set g to gcrash
271+
MOVD $runtime·gcrash(SB), g // g = &gcrash
272+
BL runtime·save_g(SB) // clobbers R0
273+
MOVD R1, g_m(g) // g.m = curm
274+
MOVD g, m_g0(R1) // curm.g0 = g
275+
276+
// switch to crashstack
277+
MOVD (g_stack+stack_hi)(g), R1 // R1 = gcrash.stack.hi
278+
SUB $(4*8), R1 // start 4 words (32 bytes) below the top of the stack
279+
MOVD R1, RSP
280+
281+
// call target function
282+
MOVD 0(R26), R0 // R0 = code pointer stored at the start of the func value
283+
CALL (R0)
284+
285+
// should never return
286+
CALL runtime·abort(SB)
287+
UNDEF
288+
265289
/*
266290
* support for morestack
267291
*/
@@ -278,6 +302,16 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
278302
// Cannot grow scheduler stack (m->g0).
279303
MOVD g_m(g), R8
280304
MOVD m_g0(R8), R4
305+
306+
// Called from f.
307+
// Set g->sched to context in f
308+
MOVD RSP, R0
309+
MOVD R0, (g_sched+gobuf_sp)(g)
310+
MOVD R29, (g_sched+gobuf_bp)(g)
311+
MOVD LR, (g_sched+gobuf_pc)(g)
312+
MOVD R3, (g_sched+gobuf_lr)(g)
313+
MOVD R26, (g_sched+gobuf_ctxt)(g)
314+
281315
CMP g, R4
282316
BNE 3(PC)
283317
BL runtime·badmorestackg0(SB)
@@ -290,15 +324,6 @@ TEXT runtime·morestack(SB),NOSPLIT|NOFRAME,$0-0
290324
BL runtime·badmorestackgsignal(SB)
291325
B runtime·abort(SB)
292326

293-
// Called from f.
294-
// Set g->sched to context in f
295-
MOVD RSP, R0
296-
MOVD R0, (g_sched+gobuf_sp)(g)
297-
MOVD R29, (g_sched+gobuf_bp)(g)
298-
MOVD LR, (g_sched+gobuf_pc)(g)
299-
MOVD R3, (g_sched+gobuf_lr)(g)
300-
MOVD R26, (g_sched+gobuf_ctxt)(g)
301-
302327
// Called from f.
303328
// Set m->morebuf to f's callers.
304329
MOVD R3, (m_morebuf+gobuf_pc)(R8) // f's caller's PC

src/runtime/crash_test.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -804,6 +804,14 @@ func TestG0StackOverflow(t *testing.T) {
804804
if n := strings.Count(string(out), "morestack on g0\n"); n != 1 {
805805
t.Fatalf("%s\n(exit status %v)", out, err)
806806
}
807+
if runtime.GOARCH == "amd64" || runtime.GOARCH == "arm64" {
808+
// check for a stack trace
809+
want := "runtime.stackOverflow"
810+
if n := strings.Count(string(out), want); n < 5 {
811+
t.Errorf("output does not contain %q at least 5 times:\n%s", want, out)
812+
}
813+
return // it's not a signal-style traceback
814+
}
807815
// Check that it's a signal-style traceback.
808816
if runtime.GOOS != "windows" {
809817
if want := "PC="; !strings.Contains(string(out), want) {

src/runtime/export_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -694,7 +694,7 @@ func G0StackOverflow() {
694694
// The stack bounds for g0 stack is not always precise.
695695
// Use an artificially small stack, to trigger a stack overflow
696696
// without actually run out of the system stack (which may seg fault).
697-
g0.stack.lo = sp - 4096
697+
g0.stack.lo = sp - 4096 - stackSystem
698698
g0.stackguard0 = g0.stack.lo + stackGuard
699699
g0.stackguard1 = g0.stackguard0
700700

src/runtime/proc.go

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -516,7 +516,20 @@ func badreflectcall() {
516516
// badmorestackg0 reports a "morestack on g0" failure. The assembly
// morestack routines call it when the current g is m.g0, and call abort
// right after it returns.
//
//go:nosplit
517517
//go:nowritebarrierrec
518518
func badmorestackg0() {
519-
writeErrStr("fatal: morestack on g0\n")
519+
if !crashStackImplemented {
520+
// No crash stack on this architecture: write the bare message and
// return to the caller.
writeErrStr("fatal: morestack on g0\n")
521+
return
522+
}
523+
524+
// Capture the g whose stack is bad before switching; the closure below
// runs with gcrash as the current g.
g := getg()
525+
switchToCrashStack(func() {
526+
print("runtime: morestack on g0, stack [", hex(g.stack.lo), " ", hex(g.stack.hi), "], sp=", hex(g.sched.sp), ", called from\n")
527+
g.m.traceback = 2 // include pc and sp in stack trace
528+
// Trace back from the context recorded in g.sched.
traceback1(g.sched.pc, g.sched.sp, g.sched.lr, g, 0)
529+
print("\n")
530+
531+
throw("morestack on g0")
532+
})
533+
520533
}
521534

522535
//go:nosplit
@@ -530,6 +543,49 @@ func badctxt() {
530543
throw("ctxt != 0")
531544
}
532545

546+
// crashstack is a space that can be used as the stack when it is
547+
// crashing on bad stack conditions, e.g. morestack on g0.
548+
// gcrash is the corresponding (fake) g.
549+
var crashstack [16384]byte
550+
551+
// gcrash's stack bounds span the whole crashstack array; both stack
// guards are placed 1000 bytes above its low end.
var gcrash = g{
552+
stack: stack{uintptr(unsafe.Pointer(&crashstack)), uintptr(unsafe.Pointer(&crashstack)) + unsafe.Sizeof(crashstack)},
553+
stackguard0: uintptr(unsafe.Pointer(&crashstack)) + 1000,
554+
stackguard1: uintptr(unsafe.Pointer(&crashstack)) + 1000,
555+
}
556+
557+
// crashingG is the g that has claimed the crash stack in
// switchToCrashStack, or nil if no g has claimed it.
var crashingG atomic.Pointer[g]
558+
559+
// Switch to crashstack and call fn, with special handling of
560+
// concurrent and recursive cases.
561+
//
562+
// Only the first g to arrive switches stacks and runs fn. A second call
// from that same g, or a call from a different g, writes a short message
// and aborts. This function does not return on any path.
//
// Nosplit as it is called in a bad stack condition (we know
563+
// morestack would fail).
564+
//
565+
//go:nosplit
566+
//go:nowritebarrierrec
567+
func switchToCrashStack(fn func()) {
568+
me := getg()
569+
// Claim the crash stack: the swap succeeds only if no g has claimed it yet.
if crashingG.CompareAndSwapNoWB(nil, me) {
570+
switchToCrashStack0(fn) // should never return
571+
abort()
572+
}
573+
// The claim failed, so some g already owns the crash stack. Check
// whether that owner is this g.
if crashingG.Load() == me {
574+
// recursive crashing. too bad.
575+
writeErrStr("fatal: recursive switchToCrashStack\n")
576+
abort()
577+
}
578+
// Another g is crashing. Give it some time, hopefully it will finish traceback.
579+
usleep_no_g(100)
580+
writeErrStr("fatal: concurrent switchToCrashStack\n")
581+
abort()
582+
}
583+
584+
// crashStackImplemented is true only when GOARCH is amd64 or arm64.
// badmorestackg0 checks it before calling switchToCrashStack.
const crashStackImplemented = GOARCH == "amd64" || GOARCH == "arm64"
585+
586+
// switchToCrashStack0 makes gcrash the current g, switches to its stack,
// and calls fn there. It is not expected to return.
//
//go:noescape
587+
func switchToCrashStack0(func()) // in assembly
588+
533589
func lockedOSThread() bool {
534590
gp := getg()
535591
return gp.lockedm != 0 && gp.m.lockedg != 0

0 commit comments

Comments
 (0)