|
169 | 169 | "mpi": {"enabled": True, "custom_mpi_options": "options", "processes_per_host": 2}
|
170 | 170 | }
|
# Distribution dictionaries passed to ``_distribution_configuration`` by the
# smdistributed / torch_distributed tests in this file.

# smdistributed "dataparallel" section enabled, torch_distributed disabled.
DISTRIBUTION_SM_DDP_ENABLED = {
    "smdistributed": {"dataparallel": {"enabled": True, "custom_mpi_options": "options"}},
    "torch_distributed": {"enabled": False},
}
# smdistributed given without a "dataparallel" section, torch_distributed disabled.
DISTRIBUTION_SM_DDP_DISABLED = {
    "smdistributed": {"enabled": True},
    "torch_distributed": {"enabled": False},
}
# smdistributed "dataparallel" section enabled, torch_distributed enabled.
DISTRIBUTION_SM_TORCH_DIST_AND_DDP_ENABLED = {
    "smdistributed": {"dataparallel": {"enabled": True, "custom_mpi_options": "options"}},
    "torch_distributed": {"enabled": True},
}
# smdistributed given without a "dataparallel" section, torch_distributed enabled.
DISTRIBUTION_SM_TORCH_DIST_AND_DDP_DISABLED = {
    "smdistributed": {"enabled": True},
    "torch_distributed": {"enabled": True},
}
|
# Placeholder S3 URI; presumably stands in for an uploaded source_dir location
# in mocked calls -- confirm against its usages.
MOCKED_S3_URI = "s3://mocked_s3_uri_from_source_dir"
# Pipeline definition config constructed with use_custom_job_prefix=False.
_DEFINITION_CONFIG = PipelineDefinitionConfig(use_custom_job_prefix=False)
|
@@ -309,6 +322,60 @@ def training_job_description(sagemaker_session):
|
309 | 322 | sagemaker_session.describe_training_job = mock_describe_training_job
|
310 | 323 | return returned_job_description
|
311 | 324 |
|
def test_validate_smdistributed_p5_raises(sagemaker_session):
    """Expect ``ValueError`` for smdistributed configs on ml.p5 without torch_distributed.

    Two frameworks are built on ``ml.p5.48xlarge`` that differ only in
    ``image_uri``; for each, both the DDP-enabled and DDP-disabled
    distribution dictionaries (each with torch_distributed off) must make
    ``_distribution_configuration`` raise. The reasons in the inline comments
    are the test author's stated intent; the validation logic itself is not
    part of this test.
    """
    # supported DLC image
    framework = DummyFramework(
        "some_script.py",
        role="DummyRole",
        instance_type="ml.p5.48xlarge",
        sagemaker_session=sagemaker_session,
        output_path="outputpath",
        image_uri="some_acceptable_image",
    )
    # both fail because instance type is p5 and torch_distributed is off
    with pytest.raises(ValueError):
        framework._distribution_configuration(DISTRIBUTION_SM_DDP_ENABLED)
    with pytest.raises(ValueError):
        framework._distribution_configuration(DISTRIBUTION_SM_DDP_DISABLED)

    # unsupported DLC image
    framework = DummyFramework(
        "some_script.py",
        role="DummyRole",
        instance_type="ml.p5.48xlarge",
        sagemaker_session=sagemaker_session,
        output_path="outputpath",
        image_uri="ecr-url/2.0.1-gpu-py310-cu121-ubuntu20.04-sagemaker-pr-3303",
    )
    # both fail due to unsupported CUDA12 DLC image
    with pytest.raises(ValueError):
        framework._distribution_configuration(DISTRIBUTION_SM_DDP_ENABLED)
    with pytest.raises(ValueError):
        framework._distribution_configuration(DISTRIBUTION_SM_DDP_DISABLED)
| 354 | + |
def test_validate_smdistributed_p5_not_raises(sagemaker_session):
    """Expect no exception when torch_distributed is enabled alongside smdistributed.

    The test passes as long as none of the ``_distribution_configuration``
    calls raise; their return values are not inspected.
    """
    framework = DummyFramework(
        "some_script.py",
        role="DummyRole",
        instance_type="ml.p5.48xlarge",
        sagemaker_session=sagemaker_session,
        output_path="outputpath",
        image_uri="ecr-url/2.0.1-gpu-py310-cu121-ubuntu20.04-sagemaker-pr-3303",
    )
    # testing with p5 instance and torch_distributed enabled
    framework._distribution_configuration(DISTRIBUTION_SM_TORCH_DIST_AND_DDP_ENABLED)
    framework._distribution_configuration(DISTRIBUTION_SM_TORCH_DIST_AND_DDP_DISABLED)

    framework = DummyFramework(
        "some_script.py",
        role="DummyRole",
        # NOTE(review): the comment below says "p4d" but this literal is
        # "ml.p4.24xlarge" -- confirm whether "ml.p4d.24xlarge" was intended.
        instance_type="ml.p4.24xlarge",
        sagemaker_session=sagemaker_session,
        output_path="outputpath",
        image_uri="some_acceptable_image",
    )
    # testing backwards compatibility with p4d instances
    framework._distribution_configuration(DISTRIBUTION_SM_TORCH_DIST_AND_DDP_ENABLED)
    framework._distribution_configuration(DISTRIBUTION_SM_TORCH_DIST_AND_DDP_DISABLED)
| 378 | + |
312 | 379 |
|
313 | 380 | def test_framework_all_init_args(sagemaker_session):
|
314 | 381 | f = DummyFramework(
|
|
0 commit comments