Skip to content

Commit 5b5e6c9

Browse files
author
Divjot Arora
committed
GODRIVER-1726 Fix SDAM error handling for write concern errors (mongodb#489)
1 parent 7969412 commit 5b5e6c9

File tree

3 files changed

+127
-14
lines changed

3 files changed

+127
-14
lines changed

mongo/integration/sdam_error_handling_test.go

Lines changed: 101 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ package integration
88

99
import (
1010
"context"
11+
"fmt"
1112
"testing"
1213
"time"
1314

@@ -19,16 +20,18 @@ import (
1920

2021
func TestSDAMErrorHandling(t *testing.T) {
2122
mt := mtest.New(t, noClientOpts)
22-
clientOpts := options.Client().
23-
ApplyURI(mt.ConnString()).
24-
SetRetryWrites(false).
25-
SetPoolMonitor(poolMonitor).
26-
SetWriteConcern(mtest.MajorityWc)
23+
baseClientOpts := func() *options.ClientOptions {
24+
return options.Client().
25+
ApplyURI(mt.ConnString()).
26+
SetRetryWrites(false).
27+
SetPoolMonitor(poolMonitor).
28+
SetWriteConcern(mtest.MajorityWc)
29+
}
2730
baseMtOpts := func() *mtest.Options {
2831
mtOpts := mtest.NewOptions().
2932
Topologies(mtest.ReplicaSet). // Don't run on sharded clusters to avoid complexity of sharded failpoints.
3033
MinServerVersion("4.0"). // 4.0+ is required to use failpoints on replica sets.
31-
ClientOptions(clientOpts)
34+
ClientOptions(baseClientOpts())
3235

3336
if mt.TopologyKind() == mtest.Sharded {
3437
// Pin to a single mongos because the tests use failpoints.
@@ -173,5 +176,97 @@ func TestSDAMErrorHandling(t *testing.T) {
173176
assert.False(mt, isPoolCleared(), "expected pool to not be cleared but was")
174177
})
175178
})
179+
mt.RunOpts("server errors", noClientOpts, func(mt *mtest.T) {
180+
// Integration tests for the SDAM error handling code path for errors in server response documents. These
181+
// errors can be part of the top-level document in ok:0 responses or in a nested writeConcernError document.
182+
183+
// On 4.4, some state change errors include a topologyVersion field. Because we're triggering these errors
184+
// via failCommand, the topologyVersion does not actually change as it would in an actual state change.
185+
// This causes the SDAM error handling code path to think we've already handled this state change and
186+
// ignore the error because it's stale. To avoid this altogether, we cap the test to <= 4.2.
187+
serverErrorsMtOpts := baseMtOpts().
188+
MinServerVersion("4.0"). // failCommand support
189+
MaxServerVersion("4.2").
190+
ClientOptions(baseClientOpts().SetRetryWrites(false))
191+
192+
testCases := []struct {
193+
name string
194+
errorCode int32
195+
196+
// For shutdown errors, the pool is always cleared. For non-shutdown errors, the pool is only cleared
197+
// for pre-4.2 servers.
198+
isShutdownError bool
199+
}{
200+
// "node is recovering" errors
201+
{"InterruptedAtShutdown", 11600, true},
202+
{"InterruptedDueToReplStateChange, not shutdown", 11602, false},
203+
{"NotMasterOrSecondary", 13436, false},
204+
{"PrimarySteppedDown", 189, false},
205+
{"ShutdownInProgress", 91, true},
206+
207+
// "not master" errors
208+
{"NotMaster", 10107, false},
209+
{"NotMasterNoSlaveOk", 13435, false},
210+
}
211+
for _, tc := range testCases {
212+
mt.RunOpts(fmt.Sprintf("command error - %s", tc.name), serverErrorsMtOpts, func(mt *mtest.T) {
213+
clearPoolChan()
214+
215+
// Cause the next insert to fail with an ok:0 response.
216+
fp := mtest.FailPoint{
217+
ConfigureFailPoint: "failCommand",
218+
Mode: mtest.FailPointMode{
219+
Times: 1,
220+
},
221+
Data: mtest.FailPointData{
222+
FailCommands: []string{"insert"},
223+
ErrorCode: tc.errorCode,
224+
},
225+
}
226+
mt.SetFailPoint(fp)
227+
228+
runServerErrorsTest(mt, tc.isShutdownError)
229+
})
230+
mt.RunOpts(fmt.Sprintf("write concern error - %s", tc.name), serverErrorsMtOpts, func(mt *mtest.T) {
231+
clearPoolChan()
232+
233+
// Cause the next insert to fail with a write concern error.
234+
fp := mtest.FailPoint{
235+
ConfigureFailPoint: "failCommand",
236+
Mode: mtest.FailPointMode{
237+
Times: 1,
238+
},
239+
Data: mtest.FailPointData{
240+
FailCommands: []string{"insert"},
241+
WriteConcernError: &mtest.WriteConcernErrorData{
242+
Code: tc.errorCode,
243+
},
244+
},
245+
}
246+
mt.SetFailPoint(fp)
247+
248+
runServerErrorsTest(mt, tc.isShutdownError)
249+
})
250+
}
251+
})
176252
})
177253
}
254+
255+
func runServerErrorsTest(mt *mtest.T, isShutdownError bool) {
256+
mt.Helper()
257+
258+
_, err := mt.Coll.InsertOne(mtest.Background, bson.D{{"x", 1}})
259+
assert.NotNil(mt, err, "expected InsertOne error, got nil")
260+
261+
// The pool should always be cleared for shutdown errors, regardless of server version.
262+
if isShutdownError {
263+
assert.True(mt, isPoolCleared(), "expected pool to be cleared, but was not")
264+
return
265+
}
266+
267+
// For non-shutdown errors, the pool is only cleared if the error is from a pre-4.2 server.
268+
wantCleared := mtest.CompareServerVersions(mt.ServerVersion(), "4.2") < 0
269+
gotCleared := isPoolCleared()
270+
assert.Equal(mt, wantCleared, gotCleared, "expected pool to be cleared: %v; pool was cleared: %v",
271+
wantCleared, gotCleared)
272+
}

x/mongo/driver/topology/server.go

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,22 @@ func (s *Server) RequestImmediateCheck() {
332332
}
333333
}
334334

335+
// getWriteConcernErrorForProcessing extracts a driver.WriteConcernError from the provided error. This function returns
336+
// (error, true) if the error is a WriteConcernError and the falls under the requirements for SDAM error
337+
// handling and (nil, false) otherwise.
338+
func getWriteConcernErrorForProcessing(err error) (*driver.WriteConcernError, bool) {
339+
writeCmdErr, ok := err.(driver.WriteCommandError)
340+
if !ok {
341+
return nil, false
342+
}
343+
344+
wcerr := writeCmdErr.WriteConcernError
345+
if wcerr != nil && (wcerr.NodeIsRecovering() || wcerr.NotMaster()) {
346+
return wcerr, true
347+
}
348+
return nil, false
349+
}
350+
335351
// ProcessError handles SDAM error handling and implements driver.ErrorProcessor.
336352
func (s *Server) ProcessError(err error, conn driver.Connection) {
337353
// ignore nil error
@@ -365,7 +381,7 @@ func (s *Server) ProcessError(err error, conn driver.Connection) {
365381
}
366382
return
367383
}
368-
if wcerr, ok := err.(driver.WriteConcernError); ok && (wcerr.NodeIsRecovering() || wcerr.NotMaster()) {
384+
if wcerr, ok := getWriteConcernErrorForProcessing(err); ok {
369385
// ignore stale error
370386
if description.CompareTopologyVersion(desc.TopologyVersion, wcerr.TopologyVersion) >= 0 {
371387
return

x/mongo/driver/topology/server_test.go

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -205,13 +205,15 @@ func TestServer(t *testing.T) {
205205
s.connectionstate = connected
206206
s.pool.connected = connected
207207

208-
wce := driver.WriteConcernError{
209-
Name: "",
210-
Code: 10107,
211-
Message: "not master",
212-
Details: []byte{},
213-
Labels: []string{},
214-
TopologyVersion: nil,
208+
wce := driver.WriteCommandError{
209+
WriteConcernError: &driver.WriteConcernError{
210+
Name: "",
211+
Code: 10107,
212+
Message: "not master",
213+
Details: []byte{},
214+
Labels: []string{},
215+
TopologyVersion: nil,
216+
},
215217
}
216218
s.ProcessError(wce, initConnection{})
217219

0 commit comments

Comments
 (0)