@@ -34,6 +34,11 @@ def deepspeed_config():
34
34
}
35
35
36
36
37
@pytest.fixture
def deepspeed_zero_config(deepspeed_config):
    """Extend the base ``deepspeed_config`` fixture with ZeRO stage 2 settings.

    Returns a shallow copy of the base config with
    ``zero_allow_untested_optimizer`` enabled and ``zero_optimization``
    set to stage 2, leaving the original fixture dict untouched.
    """
    config = dict(deepspeed_config)
    config['zero_allow_untested_optimizer'] = True
    config['zero_optimization'] = {'stage': 2}
    return config
37
42
@pytest .mark .skipif (not _DEEPSPEED_AVAILABLE , reason = "DeepSpeed not available." )
38
43
def test_deepspeed_plugin_string (tmpdir ):
39
44
"""
@@ -165,9 +170,6 @@ def test_invalid_deepspeed_defaults_no_precision(tmpdir):
165
170
166
171
@pytest .mark .skipif (not torch .cuda .is_available (), reason = "requires GPU machine" )
167
172
@pytest .mark .skipif (not _DEEPSPEED_AVAILABLE , reason = "DeepSpeed not available." )
168
- @pytest .mark .skipif (
169
- not os .getenv ("PL_RUNNING_SPECIAL_TESTS" , '0' ) == '1' , reason = "test should be run outside of pytest"
170
- )
171
173
def test_warn_deepspeed_override_backward (tmpdir ):
172
174
"""
173
175
Test to ensure that if the backward hook in the LightningModule is overridden, we throw a warning.
@@ -191,9 +193,6 @@ def backward(self, loss: Tensor, optimizer: Optimizer, optimizer_idx: int, *args
191
193
192
194
@pytest .mark .skipif (not torch .cuda .is_available (), reason = "requires GPU machine" )
193
195
@pytest .mark .skipif (not _DEEPSPEED_AVAILABLE , reason = "DeepSpeed not available." )
194
- @pytest .mark .skipif (
195
- not os .getenv ("PL_RUNNING_SPECIAL_TESTS" , '0' ) == '1' , reason = "test should be run outside of pytest"
196
- )
197
196
def test_deepspeed_run_configure_optimizers (tmpdir ):
198
197
"""
199
198
Test end to end that deepspeed works with defaults (without ZeRO as that requires compilation),
@@ -223,10 +222,7 @@ def on_train_start(self) -> None:
223
222
224
223
@pytest .mark .skipif (not torch .cuda .is_available (), reason = "requires GPU machine" )
225
224
@pytest .mark .skipif (not _DEEPSPEED_AVAILABLE , reason = "DeepSpeed not available." )
226
- @pytest .mark .skipif (
227
- not os .getenv ("PL_RUNNING_SPECIAL_TESTS" , '0' ) == '1' , reason = "test should be run outside of pytest"
228
- )
229
- def test_deepspeed_config (tmpdir , deepspeed_config ):
225
+ def test_deepspeed_config (tmpdir , deepspeed_zero_config ):
230
226
"""
231
227
Test to ensure deepspeed works correctly when passed a DeepSpeed config object including optimizers/schedulers
232
228
and saves the model weights to load correctly.
@@ -255,6 +251,58 @@ def on_train_start(self) -> None:
255
251
_assert_save_model_is_equal (model , tmpdir , trainer )
256
252
257
253
254
@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
def test_deepspeed_custom_precision_params(tmpdir):
    """
    Ensure if we modify the FP16 parameters via the DeepSpeedPlugin, the deepspeed config contains these changes.
    """

    class TestModel(BoringModel):

        def on_train_start(self) -> None:
            # Every custom FP16 knob passed to the plugin below must land in
            # the generated deepspeed config.
            fp16_config = self.trainer.training_type_plugin.config['fp16']
            for key in ('loss_scale', 'initial_scale_power', 'loss_scale_window', 'hysteresis', 'min_loss_scale'):
                assert fp16_config[key] == 10
            # Abort training immediately; the assertions above are the test.
            raise SystemExit()

    plugin = DeepSpeedPlugin(
        loss_scale=10, initial_scale_power=10, loss_scale_window=10, hysteresis=10, min_loss_scale=10
    )
    trainer = Trainer(plugins=[plugin], precision=16, gpus=1)
    with pytest.raises(SystemExit):
        trainer.fit(TestModel())
284
+
285
@pytest.mark.skipif(not _DEEPSPEED_AVAILABLE, reason="DeepSpeed not available.")
@pytest.mark.skipif(not torch.cuda.is_available(), reason="requires GPU machine")
def test_deepspeed_assert_config_zero_offload_disabled(tmpdir, deepspeed_zero_config):
    """
    Ensure if we use a config and turn off cpu_offload, that this is set to False within the config.
    """
    # Explicitly disable CPU offload in the user-supplied config.
    deepspeed_zero_config['zero_optimization']['cpu_offload'] = False

    class TestModel(BoringModel):

        def on_train_start(self) -> None:
            # The plugin must not silently re-enable offloading.
            plugin_config = self.trainer.training_type_plugin.config
            assert plugin_config['zero_optimization']['cpu_offload'] is False
            # Abort training immediately; the assertion above is the test.
            raise SystemExit()

    trainer = Trainer(
        plugins=[DeepSpeedPlugin(config=deepspeed_zero_config)],
        precision=16,
        gpus=1,
    )
    with pytest.raises(SystemExit):
        trainer.fit(TestModel())
305
+
258
306
@pytest .mark .skipif (not torch .cuda .is_available (), reason = "requires GPU machine" )
259
307
@pytest .mark .skipif (not _DEEPSPEED_AVAILABLE , reason = "DeepSpeed not available." )
260
308
@pytest .mark .skipif (torch .cuda .device_count () < 2 , reason = "test requires multi-GPU machine" )
0 commit comments