Commit 8001987

[fix] Ensure we check deepspeed/sharded in multinode DDP (#6297)
* Ensure we check deepspeed/sharded in multinode
* Add CHANGELOG.md
* Add CHANGELOG.md
* Drop mock, use actual multi-gpu node
1 parent b46d221 commit 8001987

File tree (3 files changed: +34, -5 lines)

* CHANGELOG.md
* pytorch_lightning/trainer/connectors/accelerator_connector.py
* tests/accelerators/test_accelerator_connector.py

CHANGELOG.md

Lines changed: 3 additions & 0 deletions

@@ -86,6 +86,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed duplicate logs appearing in console when using the python logging module ([#5509](https://github.com/PyTorchLightning/pytorch-lightning/pull/5509), [#6275](https://github.com/PyTorchLightning/pytorch-lightning/pull/6275))
 
 
+- Fixed error thrown when using a valid distributed mode in multi-node ([#6297](https://github.com/PyTorchLightning/pytorch-lightning/pull/6297))
+
+
 ## [1.2.1] - 2021-02-23
 
 ### Fixed

pytorch_lightning/trainer/connectors/accelerator_connector.py

Lines changed: 5 additions & 5 deletions

@@ -536,12 +536,12 @@ def set_distributed_mode(self, distributed_backend: Optional[str] = None):
         if self.distributed_backend == "horovod":
             self._set_horovod_backend()
 
-        # throw error to force user ddp or ddp2 choice
-        _ddp = (DistributedType.DDP, DistributedType.DDP_SPAWN, DistributedType.DDP2)
-        if (self.num_nodes > 1 and self._distrib_type not in _ddp):
+        using_valid_distributed = self.use_ddp or self.use_ddp2
+        if self.num_nodes > 1 and not using_valid_distributed:
+            # throw error to force user to choose a supported distributed type such as ddp or ddp2
             raise MisconfigurationException(
-                'DataParallel does not support num_nodes > 1. Switching to DistributedDataParallel for you. '
-                'To silence this warning set `accelerator="ddp"` or `accelerator="ddp2"`'
+                'Your chosen distributed type does not support num_nodes > 1. '
+                'Please set accelerator=ddp or accelerator=ddp2.'
             )
 
         rank_zero_info(f'GPU available: {torch.cuda.is_available()}, used: {self._device_type == DeviceType.GPU}')
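
For context, the updated check keys off `use_ddp`/`use_ddp2` instead of a hard-coded tuple of DDP types, so per the commit title and the new test below, DDP-based variants such as the sharded and DeepSpeed plugins now pass multi-node validation, while anything outside that set still raises. A minimal sketch of the error path follows (a hypothetical snippet, not part of this commit); it mocks the CUDA queries the same way the new test does, and it assumes `accelerator='dp'` (DataParallel, the mode called out in the old error message) remains unsupported for `num_nodes > 1`:

from unittest import mock

import pytest

from pytorch_lightning import Trainer
from pytorch_lightning.utilities.exceptions import MisconfigurationException

# Pretend two GPUs are visible so the connector reaches the multi-node check
# on a machine without real hardware (same mocking approach as the test below).
with mock.patch('torch.cuda.is_available', return_value=True), \
        mock.patch('torch.cuda.device_count', return_value=2):
    # DataParallel is not covered by `use_ddp`/`use_ddp2`, so construction
    # should fail with the new MisconfigurationException message.
    with pytest.raises(MisconfigurationException):
        Trainer(accelerator='dp', num_nodes=2, gpus=2)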

tests/accelerators/test_accelerator_connector.py

Lines changed: 26 additions & 0 deletions

@@ -28,10 +28,13 @@
     DDPPlugin,
     DDPShardedPlugin,
     DDPSpawnPlugin,
+    DDPSpawnShardedPlugin,
+    DeepSpeedPlugin,
     PrecisionPlugin,
     SingleDevicePlugin,
 )
 from pytorch_lightning.plugins.environments import ClusterEnvironment, SLURMEnvironment, TorchElasticEnvironment
+from pytorch_lightning.utilities import _DEEPSPEED_AVAILABLE
 from pytorch_lightning.utilities.exceptions import MisconfigurationException
 from tests.helpers.boring_model import BoringModel
 from tests.helpers.runif import RunIf
@@ -415,3 +418,26 @@ def test_plugin_accelerator_choice(accelerator, plugin):
 
     trainer = Trainer(plugins=plugin, num_processes=2)
     assert isinstance(trainer.accelerator.training_type_plugin, DDPShardedPlugin)
+
+
+@pytest.mark.parametrize(["accelerator", "plugin"], [
+    ('ddp', DDPPlugin),
+    ('ddp_spawn', DDPSpawnPlugin),
+    ('ddp_sharded', DDPShardedPlugin),
+    ('ddp_sharded_spawn', DDPSpawnShardedPlugin),
+    pytest.param(
+        'deepspeed',
+        DeepSpeedPlugin,
+        marks=pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
+    ),
+])
+@mock.patch('torch.cuda.is_available', return_value=True)
+@mock.patch('torch.cuda.device_count', return_value=2)
+def test_accelerator_choice_multi_node_gpu(mock_is_available, mock_device_count, accelerator, plugin, tmpdir):
+    trainer = Trainer(
+        accelerator=accelerator,
+        default_root_dir=tmpdir,
+        num_nodes=2,
+        gpus=2,
+    )
+    assert isinstance(trainer.training_type_plugin, plugin)
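
To exercise just these new cases locally, pytest can be driven from Python (a hypothetical invocation, not part of the commit; the file path and test name come from the diff above, and the DeepSpeed case skips itself when `_DEEPSPEED_AVAILABLE` is false):

import pytest

# Select only the new multi-node accelerator-choice test; -v prints one line
# per parametrized case (ddp, ddp_spawn, ddp_sharded, ddp_sharded_spawn, deepspeed).
exit_code = pytest.main([
    "tests/accelerators/test_accelerator_connector.py",
    "-k", "test_accelerator_choice_multi_node_gpu",
    "-v",
])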
