fix: clone distribution in validate_distribution #4205

Merged · 1 commit · Oct 17, 2023
47 changes: 25 additions & 22 deletions src/sagemaker/fw_utils.py
@@ -21,10 +21,11 @@
 import shutil
 import tempfile
 from collections import namedtuple
-from typing import Optional, Union, Dict
+from typing import List, Optional, Union, Dict
 from packaging import version
 
 import sagemaker.image_uris
+from sagemaker.instance_group import InstanceGroup
 from sagemaker.s3_utils import s3_path_join
 from sagemaker.session_settings import SessionSettings
 import sagemaker.utils
@@ -828,14 +829,14 @@ def _validate_smdataparallel_args(
 
 
 def validate_distribution(
-    distribution,
-    instance_groups,
-    framework_name,
-    framework_version,
-    py_version,
-    image_uri,
-    kwargs,
-):
+    distribution: Dict,
+    instance_groups: List[InstanceGroup],
+    framework_name: str,
+    framework_version: str,
+    py_version: str,
+    image_uri: str,
+    kwargs: Dict,
+) -> Dict:
     """Check if distribution strategy is correctly invoked by the user.
 
     Currently, check for `dataparallel`, `modelparallel` and heterogeneous cluster set up.
@@ -872,7 +873,9 @@
             strategy-specific inputs are incorrect/unsupported or
             heterogeneous cluster set up is incorrect
     """
-    train_instance_groups = distribution.get("instance_groups", [])
+    validated_distribution = dict(distribution)
+
+    train_instance_groups = validated_distribution.get("instance_groups", [])
     if instance_groups is None:
         if len(train_instance_groups) >= 1:
             # if estimator's instance_groups is not defined but
@@ -902,77 +905,77 @@
             instance_type = train_instance_group.instance_type
             validate_distribution_for_instance_type(
                 instance_type=instance_type,
-                distribution=distribution,
+                distribution=validated_distribution,
             )
             validate_smdistributed(
                 instance_type=instance_type,
                 framework_name=framework_name,
                 framework_version=framework_version,
                 py_version=py_version,
-                distribution=distribution,
+                distribution=validated_distribution,
                 image_uri=image_uri,
             )
             if framework_name and framework_name == "pytorch":
                 # We need to validate only for PyTorch framework
                 validate_pytorch_distribution(
-                    distribution=distribution,
+                    distribution=validated_distribution,
                     framework_name=framework_name,
                     framework_version=framework_version,
                     py_version=py_version,
                     image_uri=image_uri,
                 )
                 validate_torch_distributed_distribution(
                     instance_type=instance_type,
-                    distribution=distribution,
+                    distribution=validated_distribution,
                     framework_version=framework_version,
                     py_version=py_version,
                     image_uri=image_uri,
                     entry_point=kwargs["entry_point"],
                 )
             warn_if_parameter_server_with_multi_gpu(
-                training_instance_type=instance_type, distribution=distribution
+                training_instance_type=instance_type, distribution=validated_distribution
             )
             # get instance group names
             instance_group_names.append(train_instance_group.instance_group_name)
-        distribution["instance_groups"] = instance_group_names
+        validated_distribution["instance_groups"] = instance_group_names
     else:
         # in this case, we are handling a normal training job (without heterogeneous cluster)
         instance_type = renamed_kwargs(
             "train_instance_type", "instance_type", kwargs.get("instance_type"), kwargs
         )
         validate_distribution_for_instance_type(
             instance_type=instance_type,
-            distribution=distribution,
+            distribution=validated_distribution,
         )
         validate_smdistributed(
             instance_type=instance_type,
             framework_name=framework_name,
             framework_version=framework_version,
             py_version=py_version,
-            distribution=distribution,
+            distribution=validated_distribution,
             image_uri=image_uri,
         )
         if framework_name and framework_name == "pytorch":
             # We need to validate only for PyTorch framework
             validate_pytorch_distribution(
-                distribution=distribution,
+                distribution=validated_distribution,
                 framework_name=framework_name,
                 framework_version=framework_version,
                 py_version=py_version,
                 image_uri=image_uri,
             )
             validate_torch_distributed_distribution(
                 instance_type=instance_type,
-                distribution=distribution,
+                distribution=validated_distribution,
                 framework_version=framework_version,
                 py_version=py_version,
                 image_uri=image_uri,
                 entry_point=kwargs["entry_point"],
             )
         warn_if_parameter_server_with_multi_gpu(
-            training_instance_type=instance_type, distribution=distribution
+            training_instance_type=instance_type, distribution=validated_distribution
         )
-    return distribution
+    return validated_distribution
 
 
 def validate_distribution_for_instance_type(instance_type, distribution):
22 changes: 22 additions & 0 deletions tests/unit/test_fw_utils.py
@@ -784,6 +784,28 @@ def test_validate_distribution_raises():
         )
 
 
+def test_validate_distribution_copy():
+    train_group = InstanceGroup("train_group", "ml.p3.16xlarge", 1)
+    instance_groups = [train_group]
+    framework = "tensorflow"
+    distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
+    validated = fw_utils.validate_distribution(
+        distribution,
+        instance_groups,
+        framework,
+        None,
+        None,
+        "custom-container",
+        {"entry_point": "train.py"},
+    )
+
+    assert validated == {
+        "instance_groups": ["train_group"],
+        "smdistributed": {"dataparallel": {"enabled": True}},
+    }
+    assert validated is not distribution
+
+
 def test_validate_smdistributed_not_raises():
     smdataparallel_enabled = {"smdistributed": {"dataparallel": {"enabled": True}}}
     smdataparallel_enabled_custom_mpi = {