Switch the MultiWorkerMirroredStrategy (MWMS) integration test to a simpler Keras MNIST example

Lokiiiiii · Lokiiiiii · commit bc9edde1f66a · 2023-02-09T18:06:25.000Z
diff --git a/tests/data/tensorflow_mnist/mnist_mwms.py b/tests/data/tensorflow_mnist/mnist_mwms.py
@@ -0,0 +1,55 @@
+# https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras
+
+import json
+import os
+import tensorflow as tf
+import numpy as np
+
+
+def mnist_dataset(batch_size):
+    """Return a shuffled, endlessly repeating, batched MNIST training dataset.
+
+    `batch_size` is the number of examples in each emitted batch.
+    """
+    (images, labels), _ = tf.keras.datasets.mnist.load_data()
+    # Pixels arrive as uint8 in [0, 255]; rescale them to float32 in [0, 1].
+    images = images / np.float32(255)
+    labels = labels.astype(np.int64)
+    pairs = tf.data.Dataset.from_tensor_slices((images, labels))
+    # Size the shuffle buffer from the data instead of hard-coding 60000.
+    shuffled = pairs.shuffle(len(images))
+    return shuffled.repeat().batch(batch_size)
+
+
+def build_and_compile_cnn_model():
+    """Return a compiled Keras CNN that maps 28x28 inputs to 10 logits."""
+    layers = tf.keras.layers
+    cnn = tf.keras.Sequential()
+    cnn.add(layers.InputLayer(input_shape=(28, 28)))
+    # Add a trailing channel axis of size 1 ahead of the convolution.
+    cnn.add(layers.Reshape(target_shape=(28, 28, 1)))
+    cnn.add(layers.Conv2D(32, 3, activation="relu"))
+    cnn.add(layers.Flatten())
+    cnn.add(layers.Dense(128, activation="relu"))
+    # No activation on the output: the loss is built with from_logits=True.
+    cnn.add(layers.Dense(10))
+
+    xent = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+    sgd = tf.keras.optimizers.SGD(learning_rate=0.001)
+    cnn.compile(loss=xent, optimizer=sgd, metrics=["accuracy"])
+    return cnn
+
+
+# The worker count is read from the cluster spec in the TF_CONFIG variable.
+tf_config = json.loads(os.environ["TF_CONFIG"])
+num_workers = len(tf_config["cluster"]["worker"])
+per_worker_batch_size = 64
+
+strategy = tf.distribute.MultiWorkerMirroredStrategy()
+
+# Scale the per-worker batch size by the worker count for the global batch.
+global_batch_size = num_workers * per_worker_batch_size
+multi_worker_dataset = mnist_dataset(global_batch_size)
+with strategy.scope():  # the model is built and compiled under the strategy
+    multi_worker_model = build_and_compile_cnn_model()
+multi_worker_model.fit(multi_worker_dataset, steps_per_epoch=70, epochs=3)
diff --git a/tests/integ/test_tf.py b/tests/integ/test_tf.py
@@ -189,30 +189,14 @@ def test_mwms_gpu(
 ):
     instance_count = 2
     estimator = TensorFlow(
-        source_dir=os.path.join(RESOURCE_PATH, "huggingface", "run_mlm"),
-        entry_point="run_mlm.py",
+        source_dir=os.path.join(RESOURCE_PATH, "tensorflow_mnist"),
+        entry_point="mnist_mwms.py",
         model_dir=False,
         instance_type=kwargs["instance_type"],
         instance_count=instance_count,
         framework_version=tensorflow_training_latest_version,
         py_version=tensorflow_training_latest_py_version,
         distribution=MWMS_DISTRIBUTION,
-        hyperparameters={
-            "model_name_or_path": "bert-base-uncased",
-            "output_dir": "/opt/ml/model",
-            "dataset_name": "glue",
-            "dataset_config_name": "sst2",
-            "do_train": True,
-            "do_eval": False,
-            "max_seq_length": 128,
-            "num_train_epochs": 1,
-            "max_steps": 16,
-            "overwrite_output_dir": True,
-            "save_strategy": "no",
-            "evaluation_strategy": "no",
-            "logging_strategy": "epoch",
-            "per_device_train_batch_size": 16,
-        },
         environment={"NCCL_DEBUG": "INFO"},
         max_run=60 * 60 * 1,  # 1 hour
         role=ROLE,