|
169 | 169 | "mpi": {"enabled": True, "custom_mpi_options": "options", "processes_per_host": 2}
|
170 | 170 | }
|
171 | 171 | DISTRIBUTION_SM_DDP_ENABLED = {
|
172 |
| - "smdistributed": {"dataparallel": {"enabled": True, "custom_mpi_options": "options"}} |
| 172 | + "smdistributed": {"dataparallel": {"enabled": True, "custom_mpi_options": "options"}}, |
| 173 | + "torch_distributed": {"enabled": False}, |
| 174 | +} |
| 175 | +DISTRIBUTION_SM_DDP_DISABLED = { |
| 176 | + "smdistributed": {"enabled": True}, |
| 177 | + "torch_distributed": {"enabled": False}, |
| 178 | +} |
| 179 | +DISTRIBUTION_SM_TORCH_DIST_AND_DDP_ENABLED = { |
| 180 | + "smdistributed": {"dataparallel": {"enabled": True, "custom_mpi_options": "options"}}, |
| 181 | + "torch_distributed": {"enabled": True}, |
| 182 | +} |
| 183 | +DISTRIBUTION_SM_TORCH_DIST_AND_DDP_DISABLED = { |
| 184 | + "smdistributed": {"enabled": True}, |
| 185 | + "torch_distributed": {"enabled": True}, |
173 | 186 | }
|
174 | 187 | MOCKED_S3_URI = "s3://mocked_s3_uri_from_source_dir"
|
175 | 188 | _DEFINITION_CONFIG = PipelineDefinitionConfig(use_custom_job_prefix=False)
|
@@ -310,6 +323,85 @@ def training_job_description(sagemaker_session):
|
310 | 323 | return returned_job_description
|
311 | 324 |
|
312 | 325 |
|
| 326 | +def test_validate_smdistributed_unsupported_image_raises(sagemaker_session): |
| 327 | + # Test unsupported image raises error. |
| 328 | + for unsupported_image in DummyFramework.UNSUPPORTED_DLC_IMAGE_FOR_SM_PARALLELISM: |
| 329 | + # Fail due to unsupported CUDA12 DLC image. |
| 330 | + f = DummyFramework( |
| 331 | + "some_script.py", |
| 332 | + role="DummyRole", |
| 333 | + instance_type="ml.p4d.24xlarge", |
| 334 | + sagemaker_session=sagemaker_session, |
| 335 | + output_path="outputpath", |
| 336 | + image_uri=unsupported_image, |
| 337 | + ) |
| 338 | + with pytest.raises(ValueError): |
| 339 | + f._distribution_configuration(DISTRIBUTION_SM_DDP_ENABLED) |
| 340 | + with pytest.raises(ValueError): |
| 341 | + f._distribution_configuration(DISTRIBUTION_SM_DDP_DISABLED) |
| 342 | + |
| 343 | + # Test unsupported image with suffix raises error. |
| 344 | + for unsupported_image in DummyFramework.UNSUPPORTED_DLC_IMAGE_FOR_SM_PARALLELISM: |
| 345 | + # Fail due to unsupported CUDA12 DLC image. |
| 346 | + f = DummyFramework( |
| 347 | + "some_script.py", |
| 348 | + role="DummyRole", |
| 349 | + instance_type="ml.p4d.24xlarge", |
| 350 | + sagemaker_session=sagemaker_session, |
| 351 | + output_path="outputpath", |
| 352 | + image_uri=unsupported_image + "-ubuntu20.04-sagemaker-pr-3303", |
| 353 | + ) |
| 354 | + with pytest.raises(ValueError): |
| 355 | + f._distribution_configuration(DISTRIBUTION_SM_DDP_ENABLED) |
| 356 | + with pytest.raises(ValueError): |
| 357 | + f._distribution_configuration(DISTRIBUTION_SM_DDP_DISABLED) |
| 358 | + |
| 359 | + |
| 360 | +def test_validate_smdistributed_p5_raises(sagemaker_session): |
| 361 | + # Supported DLC image. |
| 362 | + f = DummyFramework( |
| 363 | + "some_script.py", |
| 364 | + role="DummyRole", |
| 365 | + instance_type="ml.p5.48xlarge", |
| 366 | + sagemaker_session=sagemaker_session, |
| 367 | + output_path="outputpath", |
| 368 | + image_uri="some_acceptable_image", |
| 369 | + ) |
| 370 | + # Both fail because instance type is p5 and torch_distributed is off. |
| 371 | + with pytest.raises(ValueError): |
| 372 | + f._distribution_configuration(DISTRIBUTION_SM_DDP_ENABLED) |
| 373 | + with pytest.raises(ValueError): |
| 374 | + f._distribution_configuration(DISTRIBUTION_SM_DDP_DISABLED) |
| 375 | + |
| 376 | + |
| 377 | +def test_validate_smdistributed_p5_not_raises(sagemaker_session): |
| 378 | + f = DummyFramework( |
| 379 | + "some_script.py", |
| 380 | + role="DummyRole", |
| 381 | + instance_type="ml.p5.48xlarge", |
| 382 | + sagemaker_session=sagemaker_session, |
| 383 | + output_path="outputpath", |
| 384 | + image_uri="ecr-url/2.0.1-gpu-py310-cu121-ubuntu20.04-sagemaker-pr-3303", |
| 385 | + ) |
| 386 | + # Testing with p5 instance and torch_distributed enabled. |
| 387 | + f._distribution_configuration(DISTRIBUTION_SM_TORCH_DIST_AND_DDP_ENABLED) |
| 388 | + f._distribution_configuration(DISTRIBUTION_SM_TORCH_DIST_AND_DDP_DISABLED) |
| 389 | + |
| 390 | + |
| 391 | +def test_validate_smdistributed_backward_compat_p4_not_raises(sagemaker_session): |
| 392 | + f = DummyFramework( |
| 393 | + "some_script.py", |
| 394 | + role="DummyRole", |
| 395 | + instance_type="ml.p4d.24xlarge", |
| 396 | + sagemaker_session=sagemaker_session, |
| 397 | + output_path="outputpath", |
| 398 | + image_uri="some_acceptable_image", |
| 399 | + ) |
| 400 | + # Testing backwards compatibility with p4d instances. |
| 401 | + f._distribution_configuration(DISTRIBUTION_SM_TORCH_DIST_AND_DDP_ENABLED) |
| 402 | + f._distribution_configuration(DISTRIBUTION_SM_TORCH_DIST_AND_DDP_DISABLED) |
| 403 | + |
| 404 | + |
313 | 405 | def test_framework_all_init_args(sagemaker_session):
|
314 | 406 | f = DummyFramework(
|
315 | 407 | "my_script.py",
|
|
0 commit comments