SPEC-1708 Increase replSetStepDown timeout and run replSetFreeze instead of retrying (#826)

ShaneHarvey · web-flow · commit 2bedff2fbd98 · 2020-07-14T19:33:45.000-07:00
Increase waitForPrimaryChange timeout to work around slow elections on Windows.
diff --git a/source/connections-survive-step-down/tests/README.rst b/source/connections-survive-step-down/tests/README.rst
@@ -64,26 +64,16 @@ Perform the following operations:
 - Insert 5 documents into a collection with a majority write concern.
 - Start a find operation on the collection with a batch size of 2, and
   retrieve the first batch of results.
-- Send a ``{replSetStepDown: 5, force: true}`` command to the current primary and verify that
+- Send a ``{replSetFreeze: 0}`` command to any secondary and verify that the
+  command succeeded. This command will unfreeze the secondary and ensure that
+  it will be eligible to be elected immediately.
+- Send a ``{replSetStepDown: 30, force: true}`` command to the current primary and verify that
   the command succeeded.
 - Retrieve the next batch of results from the cursor obtained in the find
   operation, and verify that this operation succeeded.
 - If the driver implements the `CMAP`_ specification, verify that no new `PoolClearedEvent`_ has been
   published. Otherwise verify that `connections.totalCreated`_ in `serverStatus`_ has not changed.
 
-**Note:** The "replSetStepDown" command often fails with the following
-transient error (see `SERVER-48154`_)::
-
-  {
-    "ok" : 0,
-    "errmsg" : "Unable to acquire X lock on '{4611686018427387905: ReplicationStateTransition, 1}' within 1000ms. opId: 922, op: conn30, connId: 30.",
-    "code" : 24,
-    "codeName" : "LockTimeout",
-  }
-
-When running the "replSetStepDown" command, drivers MUST retry until the
-command succeeds. The number of retries should be limited to avoid an infinite
-failure loop. For example, the Python driver uses a 10 second retry period.
 
 Not Master - Keep Connection Pool
 `````````````````````````````````
@@ -186,4 +176,3 @@ server communication.
 .. _PoolClearedEvent: /source/connection-monitoring-and-pooling/connection-monitoring-and-pooling.rst#events
 .. _serverStatus: https://docs.mongodb.com/manual/reference/command/serverStatus
 .. _connections.totalCreated: https://docs.mongodb.com/manual/reference/command/serverStatus/#serverstatus.connections.totalCreated
-.. _SERVER-48154: https://jira.mongodb.org/browse/SERVER-48154
diff --git a/source/server-discovery-and-monitoring/tests/README.rst b/source/server-discovery-and-monitoring/tests/README.rst
@@ -304,26 +304,12 @@ MongoClient from the one used for test operations. For example::
 
       - name: runAdminCommand
         object: testRunner
-        command_name: replSetStepDown
+        command_name: replSetFreeze
         arguments:
           command:
-            replSetStepDown: 1
-            secondaryCatchUpPeriodSecs: 1
-            force: false
-
-**Note:** The "replSetStepDown" command often fails with the following
-transient error (see `SERVER-48154`_)::
-
-  {
-    "ok" : 0,
-    "errmsg" : "Unable to acquire X lock on '{4611686018427387905: ReplicationStateTransition, 1}' within 1000ms. opId: 922, op: conn30, connId: 30.",
-    "code" : 24,
-    "codeName" : "LockTimeout",
-  }
-
-When running the "replSetStepDown" command, drivers MUST retry until the
-command succeeds. The number of retries should be limited to avoid an infinite
-failure loop. For example, the Python driver uses a 10 second retry period.
+            replSetFreeze: 0
+          readPreference:
+            mode: Secondary
 
 waitForPrimaryChange
 ''''''''''''''''''''
@@ -456,4 +442,3 @@ Run the following test(s) on MongoDB 4.4+.
 .. Section for links.
 
 .. _Server Description Equality: /source/server-discovery-and-monitoring/server-discovery-and-monitoring.rst#server-description-equality
-.. _SERVER-48154: https://jira.mongodb.org/browse/SERVER-48154
diff --git a/source/server-discovery-and-monitoring/tests/integration/rediscover-quickly-after-step-down.json b/source/server-discovery-and-monitoring/tests/integration/rediscover-quickly-after-step-down.json
@@ -45,14 +45,27 @@
           "name": "recordPrimary",
           "object": "testRunner"
         },
+        {
+          "name": "runAdminCommand",
+          "object": "testRunner",
+          "command_name": "replSetFreeze",
+          "arguments": {
+            "command": {
+              "replSetFreeze": 0
+            },
+            "readPreference": {
+              "mode": "Secondary"
+            }
+          }
+        },
         {
           "name": "runAdminCommand",
           "object": "testRunner",
           "command_name": "replSetStepDown",
           "arguments": {
             "command": {
-              "replSetStepDown": 1,
-              "secondaryCatchUpPeriodSecs": 1,
+              "replSetStepDown": 30,
+              "secondaryCatchUpPeriodSecs": 30,
               "force": false
             }
           }
@@ -61,7 +74,7 @@
           "name": "waitForPrimaryChange",
           "object": "testRunner",
           "arguments": {
-            "timeoutMS": 5000
+            "timeoutMS": 15000
           }
         },
         {
diff --git a/source/server-discovery-and-monitoring/tests/integration/rediscover-quickly-after-step-down.yml b/source/server-discovery-and-monitoring/tests/integration/rediscover-quickly-after-step-down.yml
@@ -31,19 +31,30 @@ tests:
             - _id: 4
       - name: recordPrimary
         object: testRunner
+      # Unfreeze a secondary with replSetFreeze:0 to ensure a speedy election.
+      - name: runAdminCommand
+        object: testRunner
+        command_name: replSetFreeze
+        arguments:
+          command:
+            replSetFreeze: 0
+          readPreference:
+            mode: Secondary
       # Run replSetStepDown on the meta client.
       - name: runAdminCommand
         object: testRunner
         command_name: replSetStepDown
         arguments:
           command:
-            replSetStepDown: 1
-            secondaryCatchUpPeriodSecs: 1
+            replSetStepDown: 30
+            secondaryCatchUpPeriodSecs: 30
             force: false
       - name: waitForPrimaryChange
         object: testRunner
         arguments:
-          timeoutMS: 5000
+          # We use a relatively large timeout here to work around slow
+          # elections on Windows, possibly caused by SERVER-48154.
+          timeoutMS: 15000
       # Rediscover the new primary.
       - name: insertMany
         object: collection