|
# Script file name shared by tests in this module (presumably used as a
# training entry point -- confirm against the callers).
SCRIPT = "mnist.py"

# Values for the estimator's ``distribution`` argument; each enables exactly
# one distributed-training mode.
PARAMETER_SERVER_DISTRIBUTION = {"parameter_server": {"enabled": True}}
MPI_DISTRIBUTION = {"mpi": {"enabled": True}}
MWMS_DISTRIBUTION = {"multi_worker_mirrored_strategy": {"enabled": True}}

# Sample key/value tag list and environment-variable mapping used as test inputs.
TAGS = [{"Key": "some-key", "Value": "some-value"}]
ENV_INPUT = {"env_key1": "env_val1", "env_key2": "env_val2", "env_key3": "env_val3"}
|
43 | 44 |
|
@@ -181,6 +182,79 @@ def test_server_side_encryption(sagemaker_session, tf_full_version, tf_full_py_v
|
181 | 182 | )
|
182 | 183 |
|
183 | 184 |
|
@pytest.mark.release
@pytest.mark.skipif(
    tests.integ.test_region() in tests.integ.TRAINING_NO_P2_REGIONS
    and tests.integ.test_region() in tests.integ.TRAINING_NO_P3_REGIONS,
    reason="no ml.p2 or ml.p3 instances in this region",
)
@retry_with_instance_list(gpu_list(tests.integ.test_region()))
def test_mwms_gpu(
    sagemaker_session,
    tensorflow_training_latest_version,
    tensorflow_training_latest_py_version,
    capsys,
    imagenet_train_subset,
    **kwargs,
):
    """Train ResNet-50 on two GPU instances with multi-worker mirrored strategy.

    Runs ``official/vision/train.py`` from the ``tensorflow/models`` repository
    (branch ``v2.9.2``) with ``distribution=MWMS_DISTRIBUTION``, then checks that
    the job logs report the multi-worker mirrored setup and that a model
    artifact was written to S3.

    Args:
        sagemaker_session: Session used to launch the job and to reach S3.
        tensorflow_training_latest_version: Framework version for the estimator.
        tensorflow_training_latest_py_version: Python version for the estimator.
        capsys: Pytest capture fixture; the streamed job logs are read from it.
        imagenet_train_subset: Training input passed to ``estimator.fit``.
        **kwargs: Must contain ``instance_type`` (presumably injected by
            ``retry_with_instance_list`` -- confirm against that decorator).
    """
    from urllib.parse import urlparse

    epochs = 1
    global_batch_size = 64
    # Size the run as if one epoch covered 10**4 samples.
    train_steps = int(10**4 * epochs / global_batch_size)
    steps_per_loop = train_steps // 10
    # Comma-separated overrides handed to the script's ``params_override`` option.
    overrides = (
        "runtime.enable_xla=False,"
        "runtime.num_gpus=1,"
        "runtime.distribution_strategy=multi_worker_mirrored,"
        "runtime.mixed_precision_dtype=float16,"
        f"task.train_data.global_batch_size={global_batch_size},"
        "task.train_data.input_path=/opt/ml/input/data/training/train-000*,"
        "task.train_data.cache=True,"
        f"trainer.train_steps={train_steps},"
        f"trainer.steps_per_loop={steps_per_loop},"
        f"trainer.summary_interval={steps_per_loop},"
        f"trainer.checkpoint_interval={train_steps},"
        "task.model.backbone.type=resnet,"
        "task.model.backbone.resnet.model_id=50"
    )
    estimator = TensorFlow(
        git_config={
            "repo": "https://github.com/tensorflow/models.git",
            "branch": "v2.9.2",
        },
        source_dir=".",
        entry_point="official/vision/train.py",
        # The script receives ``model_dir`` through the hyperparameters below.
        model_dir=False,
        instance_type=kwargs["instance_type"],
        instance_count=2,
        framework_version=tensorflow_training_latest_version,
        py_version=tensorflow_training_latest_py_version,
        distribution=MWMS_DISTRIBUTION,
        hyperparameters={
            "experiment": "resnet_imagenet",
            "config_file": "official/vision/configs/experiments/image_classification/imagenet_resnet50_gpu.yaml",
            "mode": "train",
            "model_dir": "/opt/ml/model",
            "params_override": overrides,
        },
        environment={
            "NCCL_DEBUG": "INFO",
        },
        max_run=60 * 60 * 1,  # 1 hour
        role=ROLE,
        volume_size=400,
        sagemaker_session=sagemaker_session,
        disable_profiler=True,
    )

    with tests.integ.timeout.timeout(minutes=tests.integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):
        estimator.fit(inputs=imagenet_train_subset, job_name=unique_name_from_base("test-tf-mwms"))

    captured = capsys.readouterr()
    logs = captured.out + captured.err
    assert "Running distributed training job with multi_worker_mirrored_strategy setup" in logs

    # Model-saving check: the finished job must report a model artifact and the
    # object must exist in S3 (head_object raises if it is missing).
    model_data = estimator.model_data
    assert model_data, "training job did not report a model artifact location"
    artifact_location = urlparse(model_data)
    s3_client = sagemaker_session.boto_session.client("s3")
    s3_client.head_object(
        Bucket=artifact_location.netloc,
        Key=artifact_location.path.lstrip("/"),
    )
| 256 | + |
| 257 | + |
184 | 258 | @pytest.mark.release
|
185 | 259 | def test_mnist_distributed_cpu(
|
186 | 260 | sagemaker_session,
|
|
0 commit comments