testing: use monotonic counts to attribute races in subtests

Bryan C. Mills · gopherbot · commit 55b8e16b2eb7 · 2023-10-25T20:44:25.000Z
This implements the approach I described in https://go-review.git.corp.google.com/c/go/+/494057/1#message-5c9773bded2f89b4058848cb036b860aa6716de3. Specifically: - Each level of test atomically records the cumulative number of races seen as of the last race-induced test failure. - When a subtest fails, it logs the race error, and then updates its parents' counters so that they will not log the same error. - We check each test or benchmark for races before it starts running each of its subtests or sub-benchmark, before unblocking parallel subtests, and after running any cleanup functions. With this implementation, it should be the case that every test that is running when a race is detected reports that race, and any race reported for a subtest is not redundantly reported for its parent. The regression tests are based on those added in CL 494057 and CL 501895, with a few additions based on my own review of the code. Fixes #60083. Change-Id: I578ae929f192a7a951b31b17ecb560cbbf1ef7a1 Cq-Include-Trybots: luci.golang.try:gotip-linux-amd64-longtest,gotip-linux-amd64-longtest-race,gotip-windows-amd64-longtest Reviewed-on: https://go-review.googlesource.com/c/go/+/506300 Reviewed-by: Ian Lance Taylor <iant@google.com> Auto-Submit: Bryan Mills <bcmills@google.com> LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
diff --git a/src/runtime/race/output_test.go b/src/runtime/race/output_test.go
@@ -208,8 +208,8 @@ func TestFail(t *testing.T) {
 `, []string{`
 ==================
 --- FAIL: TestFail \([0-9.]+s\)
-.*main_test.go:14: true
 .*testing.go:.*: race detected during execution of test
+.*main_test.go:14: true
 FAIL`}},
 
 	{"slicebytetostring_pc", "run", "", "atexit_sleep_ms=0", `
diff --git a/src/testing/benchmark.go b/src/testing/benchmark.go
@@ -7,7 +7,6 @@ package testing
 import (
 	"flag"
 	"fmt"
-	"internal/race"
 	"internal/sysinfo"
 	"io"
 	"math"
@@ -179,11 +178,14 @@ func (b *B) ReportAllocs() {
 func (b *B) runN(n int) {
 	benchmarkLock.Lock()
 	defer benchmarkLock.Unlock()
-	defer b.runCleanup(normalPanic)
+	defer func() {
+		b.runCleanup(normalPanic)
+		b.checkRaces()
+	}()
 	// Try to get a comparable environment for each run
 	// by clearing garbage from previous runs.
 	runtime.GC()
-	b.raceErrors = -race.Errors()
+	b.resetRaces()
 	b.N = n
 	b.parallelism = 1
 	b.ResetTimer()
@@ -192,10 +194,6 @@ func (b *B) runN(n int) {
 	b.StopTimer()
 	b.previousN = n
 	b.previousDuration = b.duration
-	b.raceErrors += race.Errors()
-	if b.raceErrors > 0 {
-		b.Errorf("race detected during execution of benchmark")
-	}
 }
 
 // run1 runs the first iteration of benchFunc. It reports whether more
diff --git a/src/testing/fuzz.go b/src/testing/fuzz.go
@@ -636,6 +636,7 @@ func fRunner(f *F, fn func(*F)) {
 		// Unfortunately, recovering here adds stack frames, but the location of
 		// the original panic should still be
 		// clear.
+		f.checkRaces()
 		if f.Failed() {
 			numFailed.Add(1)
 		}
@@ -719,6 +720,7 @@ func fRunner(f *F, fn func(*F)) {
 	}()
 
 	f.start = time.Now()
+	f.resetRaces()
 	fn(f)
 
 	// Code beyond this point will not be executed when FailNow or SkipNow
diff --git a/src/testing/testing.go b/src/testing/testing.go
@@ -611,7 +611,6 @@ type common struct {
 	bench          bool           // Whether the current test is a benchmark.
 	hasSub         atomic.Bool    // whether there are sub-benchmarks.
 	cleanupStarted atomic.Bool    // Registered cleanup callbacks have started to execute
-	raceErrors     int            // Number of races detected during test.
 	runner         string         // Function name of tRunner running the test.
 	isParallel     bool           // Whether the test is parallel.
 
@@ -625,6 +624,9 @@ type common struct {
 	signal   chan bool // To signal a test is done.
 	sub      []*T      // Queue of subtests to be run in parallel.
 
+	lastRaceErrors  atomic.Int64 // Max value of race.Errors seen during the test or its subtests.
+	raceErrorLogged atomic.Bool
+
 	tempDirMu  sync.Mutex
 	tempDir    string
 	tempDirErr error
@@ -955,9 +957,15 @@ func (c *common) Fail() {
 // Failed reports whether the function has failed.
 func (c *common) Failed() bool {
 	c.mu.RLock()
-	failed := c.failed
-	c.mu.RUnlock()
-	return failed || c.raceErrors+race.Errors() > 0
+	defer c.mu.RUnlock()
+
+	if !c.done && int64(race.Errors()) > c.lastRaceErrors.Load() {
+		c.mu.RUnlock()
+		c.checkRaces()
+		c.mu.RLock()
+	}
+
+	return c.failed
 }
 
 // FailNow marks the function as having failed and stops its execution
@@ -1346,6 +1354,69 @@ func (c *common) runCleanup(ph panicHandling) (panicVal any) {
 	}
 }
 
+// resetRaces updates c.parent's count of data race errors (or the global count,
+// if c has no parent), and updates c.lastRaceErrors to match.
+//
+// Any races that occurred prior to this call to resetRaces will
+// not be attributed to c.
+func (c *common) resetRaces() {
+	if c.parent == nil {
+		c.lastRaceErrors.Store(int64(race.Errors()))
+	} else {
+		c.lastRaceErrors.Store(c.parent.checkRaces())
+	}
+}
+
+// checkRaces checks whether the global count of data race errors has increased
+// since c's count was last reset.
+//
+// If so, it marks c as having failed due to those races (logging an error for
+// the first such race), and updates the race counts for the parents of c so
+// that if they are currently suspended (such as in a call to T.Run) they will
+// not log separate errors for the race(s).
+//
+// Note that multiple tests may be marked as failed due to the same race if they
+// are executing in parallel.
+func (c *common) checkRaces() (raceErrors int64) {
+	raceErrors = int64(race.Errors())
+	for {
+		last := c.lastRaceErrors.Load()
+		if raceErrors <= last {
+			// All races have already been reported.
+			return raceErrors
+		}
+		if c.lastRaceErrors.CompareAndSwap(last, raceErrors) {
+			break
+		}
+	}
+
+	if c.raceErrorLogged.CompareAndSwap(false, true) {
+		// This is the first race we've encountered for this test.
+		// Mark the test as failed, and log the reason why only once.
+		// (Note that the race detector itself will still write a goroutine
+		// dump for any further races it detects.)
+		c.Errorf("race detected during execution of test")
+	}
+
+	// Update the parent(s) of this test so that they don't re-report the race.
+	parent := c.parent
+	for parent != nil {
+		for {
+			last := parent.lastRaceErrors.Load()
+			if raceErrors <= last {
+				// This race was already reported by another (likely parallel) subtest.
+				return raceErrors
+			}
+			if parent.lastRaceErrors.CompareAndSwap(last, raceErrors) {
+				break
+			}
+		}
+		parent = parent.parent
+	}
+
+	return raceErrors
+}
+
 // callerName gives the function name (qualified with a package path)
 // for the caller after skip frames (where 0 means the current function).
 func callerName(skip int) string {
@@ -1390,7 +1461,18 @@ func (t *T) Parallel() {
 
 	// Add to the list of tests to be released by the parent.
 	t.parent.sub = append(t.parent.sub, t)
-	t.raceErrors += race.Errors()
+
+	// Report any races during execution of this test up to this point.
+	//
+	// We will assume that any races that occur between here and the point where
+	// we unblock are not caused by this subtest. That assumption usually holds,
+	// although it can be wrong if the test spawns a goroutine that races in the
+	// background while the rest of the test is blocked on the call to Parallel.
+	// If that happens, we will misattribute the background race to some other
+	// test, or to no test at all — but that false-negative is so unlikely that it
+	// is not worth adding race-report noise for the common case where the test is
+	// completely suspended during the call to Parallel.
+	t.checkRaces()
 
 	if t.chatty != nil {
 		t.chatty.Updatef(t.name, "=== PAUSE %s\n", t.name)
@@ -1405,9 +1487,16 @@ func (t *T) Parallel() {
 		t.chatty.Updatef(t.name, "=== CONT  %s\n", t.name)
 	}
 	running.Store(t.name, time.Now())
-
 	t.start = time.Now()
-	t.raceErrors += -race.Errors()
+
+	// Reset the local race counter to ignore any races that happened while this
+	// goroutine was blocked, such as in the parent test or in other parallel
+	// subtests.
+	//
+	// (Note that we don't call parent.checkRaces here:
+	// if other parallel subtests have already introduced races, we want to
+	// let them report those races instead of attributing them to the parent.)
+	t.lastRaceErrors.Store(int64(race.Errors()))
 }
 
 // Setenv calls os.Setenv(key, value) and uses Cleanup to
@@ -1455,14 +1544,13 @@ func tRunner(t *T, fn func(t *T)) {
 	// a call to runtime.Goexit, record the duration and send
 	// a signal saying that the test is done.
 	defer func() {
+		t.checkRaces()
+
+		// TODO(#61034): This is the wrong place for this check.
 		if t.Failed() {
 			numFailed.Add(1)
 		}
 
-		if t.raceErrors+race.Errors() > 0 {
-			t.Errorf("race detected during execution of test")
-		}
-
 		// Check if the test panicked or Goexited inappropriately.
 		//
 		// If this happens in a normal test, print output but continue panicking.
@@ -1564,6 +1652,7 @@ func tRunner(t *T, fn func(t *T)) {
 			if err != nil {
 				doPanic(err)
 			}
+			t.checkRaces()
 			if !t.isParallel {
 				// Reacquire the count for sequential tests. See comment in Run.
 				t.context.waitParallel()
@@ -1589,7 +1678,7 @@ func tRunner(t *T, fn func(t *T)) {
 	}()
 
 	t.start = time.Now()
-	t.raceErrors = -race.Errors()
+	t.resetRaces()
 	fn(t)
 
 	// code beyond here will not be executed when FailNow is invoked
@@ -1936,7 +2025,12 @@ func (m *M) Run() (code int) {
 				testOk = false
 			}
 		}
-		if !testOk || !exampleOk || !fuzzTargetsOk || !runBenchmarks(m.deps.ImportPath(), m.deps.MatchString, m.benchmarks) || race.Errors() > 0 {
+		anyFailed := !testOk || !exampleOk || !fuzzTargetsOk || !runBenchmarks(m.deps.ImportPath(), m.deps.MatchString, m.benchmarks)
+		if !anyFailed && race.Errors() > 0 {
+			fmt.Print(chatty.prefix(), "testing: race detected outside of test execution\n")
+			anyFailed = true
+		}
+		if anyFailed {
 			fmt.Print(chatty.prefix(), "FAIL\n")
 			m.exitCode = 1
 			return
diff --git a/src/testing/testing_test.go b/src/testing/testing_test.go