Fix has_passed_step and add tests for has_passed_step with mode steps (aws#147)

NihalHarish · Vikas-kum · commit 2ada1d41d8b6 · 2020-01-16T15:46:06.000-08:00
* add tests for has_passed_step
diff --git a/smdebug/trials/trial.py b/smdebug/trials/trial.py
@@ -517,11 +517,13 @@ def has_passed_step(self, step, mode=ModeKeys.GLOBAL) -> StepState:
         """
         all_steps = self.steps(mode=mode, show_incomplete_steps=True)
         bisect_idx = bisect_left(all_steps, step)
-        g_step = self._global_step_currently(mode, step)
 
         if bisect_idx < len(all_steps):
+            # g_step is the global step of all_steps[bisect_idx]: the requested mode step itself
+            # if it was recorded, otherwise the closest recorded mode step greater than it
+            g_step = self._global_step_currently(mode, all_steps[bisect_idx])
             if all_steps[bisect_idx] > step:
-                if self.last_complete_step > g_step:
+                if self.last_complete_step >= g_step:
                     return StepState.UNAVAILABLE
                 return StepState.NOT_YET_AVAILABLE
             elif all_steps[bisect_idx] == step:
diff --git a/tests/analysis/trials/test_has_passed_step_scenarios.py b/tests/analysis/trials/test_has_passed_step_scenarios.py
@@ -1,15 +1,53 @@
 # Standard Library
+import json
+import os
+import shutil
+import uuid
+from pathlib import Path
 
 # Third Party
 import pytest
 
 # First Party
+from smdebug.core.collection_manager import CollectionManager
 from smdebug.core.config_constants import INCOMPLETE_STEP_WAIT_WINDOW_KEY
+from smdebug.core.locations import IndexFileLocationUtils
+from smdebug.core.modes import ModeKeys
 from smdebug.core.tensor import StepState
 from smdebug.exceptions import NoMoreData, StepUnavailable
 from smdebug.trials import create_trial
 
 
+def dummy_trial_creator(trial_dir, num_workers, job_ended):  # test helper: lay out a trial directory that has no steps yet
+    Path(trial_dir).mkdir(parents=True, exist_ok=True)  # create trial_dir and its parents; fine if it already exists
+    cm = CollectionManager()  # default-constructed manager; nothing is added to it before export
+    for i in range(num_workers):
+        collection_file_name = f"worker_{i}_collections.json"  # one collections file per worker index
+        cm.export(trial_dir, collection_file_name)
+    if job_ended:
+        Path(os.path.join(trial_dir, "training_job_end.ts")).touch()  # marker file, created only when job_ended is truthy
+
+
+def dummy_step_creator(trial_dir, global_step, mode, mode_step, worker_name):  # test helper: write one index file for a step
+    static_step_data = (  # JSON template; "mode" and "mode_step" are overwritten below
+        '{"meta": {"mode": "TRAIN", "mode_step": 0, "event_file_name": ""}, '
+        '"tensor_payload": ['
+        '{"tensorname": "gradients/dummy:0", "start_idx": 0, "length": 1}'
+        "]}"
+    )
+
+    step = json.loads(static_step_data)
+    step["meta"]["mode"] = mode  # the tests in this file pass the strings "TRAIN" / "EVAL"
+    step["meta"]["mode_step"] = mode_step
+
+    index_file_location = IndexFileLocationUtils.get_index_key_for_step(  # file path is decided by the library helper
+        trial_dir, global_step, worker_name
+    )
+    Path(os.path.dirname(index_file_location)).mkdir(parents=True, exist_ok=True)  # make sure the parent directory exists
+    with open(index_file_location, "w") as f:
+        json.dump(step, f)  # "event_file_name" stays ""; this helper writes only the index file
+
+
 @pytest.mark.slow
 def test_single_writer_all_steps_written_complete_job():
     """Test Scenario Description"
@@ -38,6 +76,112 @@ def test_single_writer_all_steps_written_complete_job():
     assert trial.last_complete_step == 6
 
 
+@pytest.mark.slow
+def test_single_writer_all_steps_written_complete_job_two_modes():
+    """Test Scenario Description"
+     workers : [a]
+     modes: TRAIN, EVAL
+     steps :{
+        0: [worker:a, mode: TRAIN, mode_step: 0],
+        10: [worker:a, mode: TRAIN, mode_step: 10],
+        20: [worker:a, mode: TRAIN, mode_step: 20],
+        30: [worker:a, mode: TRAIN, mode_step: 30],
+        40: [worker:a, mode: EVAL, mode_step: 0],
+        50: [worker:a, mode: EVAL, mode_step: 10],
+        60: [worker:a, mode: EVAL, mode_step: 20],
+        70: [worker:a, mode: EVAL, mode_step: 30]
+        }
+    END_OF_JOB.ts --> Present
+    """
+
+    path = os.path.join("ts_output/train/", str(uuid.uuid4()))  # unique directory per run, relative to the working directory
+    dummy_trial_creator(trial_dir=path, num_workers=1, job_ended=True)  # job_ended=True -> training_job_end.ts is created
+    for i in range(0, 31, 10):  # TRAIN: global steps 0..30, mode step equal to the global step
+        dummy_step_creator(
+            trial_dir=path, global_step=i, mode="TRAIN", mode_step=i, worker_name="worker_0"
+        )
+
+    for i in range(0, 31, 10):  # EVAL: global steps 40..70 carry mode steps 0..30
+        dummy_step_creator(
+            trial_dir=path, global_step=i + 40, mode="EVAL", mode_step=i, worker_name="worker_0"
+        )
+
+    trial = create_trial(path)
+    num_workers = len(trial.workers())
+    assert num_workers == 1
+    assert trial.loaded_all_steps is True
+    all_steps = trial.steps(show_incomplete_steps=True)
+    completed_steps = trial.steps()
+    assert all_steps == [0, 10, 20, 30, 40, 50, 60, 70]
+    assert completed_steps == all_steps
+    assert trial.has_passed_step(30) == StepState.AVAILABLE
+    assert trial.has_passed_step(23, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE  # no TRAIN step 23 was written
+    assert trial.has_passed_step(40, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE  # past the last TRAIN mode step (30); job marked ended
+    assert trial.has_passed_step(30, mode=ModeKeys.EVAL) == StepState.AVAILABLE
+    assert trial.has_passed_step(23, mode=ModeKeys.EVAL) == StepState.UNAVAILABLE  # no EVAL step 23 was written
+    assert trial.has_passed_step(80) == StepState.UNAVAILABLE  # beyond every written step; job marked ended
+    assert trial.has_passed_step(80, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE
+    assert trial.has_passed_step(80, mode=ModeKeys.EVAL) == StepState.UNAVAILABLE
+    assert trial.last_index_token == os.path.join(
+        path, "index/000000000/000000000070_worker_0.json"
+    )
+    assert trial.last_complete_step == 70
+    shutil.rmtree(path, ignore_errors=True)  # NOTE(review): not reached if an assertion above fails, leaving the directory behind
+
+
+@pytest.mark.slow
+def test_single_writer_all_steps_written_incomplete_job_two_modes():
+    """Test Scenario Description"
+     workers : [a]
+     modes: TRAIN, EVAL
+     steps :{
+        0: [worker:a, mode: TRAIN, mode_step: 0],
+        10: [worker:a, mode: TRAIN, mode_step: 10],
+        20: [worker:a, mode: TRAIN, mode_step: 20],
+        30: [worker:a, mode: TRAIN, mode_step: 30],
+        40: [worker:a, mode: EVAL, mode_step: 0],
+        50: [worker:a, mode: EVAL, mode_step: 10],
+        60: [worker:a, mode: EVAL, mode_step: 20],
+        70: [worker:a, mode: EVAL, mode_step: 30]
+        }
+    END_OF_JOB.ts --> Absent
+    """
+
+    path = os.path.join("ts_output/train/", str(uuid.uuid4()))  # unique directory per run, relative to the working directory
+    dummy_trial_creator(trial_dir=path, num_workers=1, job_ended=False)  # job_ended=False -> no training_job_end.ts marker
+    for i in range(0, 31, 10):  # TRAIN: global steps 0..30, mode step equal to the global step
+        dummy_step_creator(
+            trial_dir=path, global_step=i, mode="TRAIN", mode_step=i, worker_name="worker_0"
+        )
+
+    for i in range(0, 31, 10):  # EVAL: global steps 40..70 carry mode steps 0..30
+        dummy_step_creator(
+            trial_dir=path, global_step=i + 40, mode="EVAL", mode_step=i, worker_name="worker_0"
+        )
+
+    trial = create_trial(path)
+    num_workers = len(trial.workers())
+    assert num_workers == 1
+    assert trial.loaded_all_steps is False
+    all_steps = trial.steps(show_incomplete_steps=True)
+    completed_steps = trial.steps()
+    assert all_steps == [0, 10, 20, 30, 40, 50, 60, 70]
+    assert completed_steps == all_steps
+    assert trial.has_passed_step(30) == StepState.AVAILABLE
+    assert trial.has_passed_step(23, mode=ModeKeys.TRAIN) == StepState.UNAVAILABLE  # no TRAIN step 23 was written; TRAIN step 30 follows it
+    assert trial.has_passed_step(40, mode=ModeKeys.TRAIN) == StepState.NOT_YET_AVAILABLE  # past the last TRAIN mode step (30); no end-of-job marker
+    assert trial.has_passed_step(30, mode=ModeKeys.EVAL) == StepState.AVAILABLE
+    assert trial.has_passed_step(23, mode=ModeKeys.EVAL) == StepState.UNAVAILABLE  # no EVAL step 23 was written; EVAL step 30 follows it
+    assert trial.has_passed_step(80) == StepState.NOT_YET_AVAILABLE  # beyond every written step; no end-of-job marker
+    assert trial.has_passed_step(80, mode=ModeKeys.TRAIN) == StepState.NOT_YET_AVAILABLE
+    assert trial.has_passed_step(80, mode=ModeKeys.EVAL) == StepState.NOT_YET_AVAILABLE
+    assert trial.last_index_token == os.path.join(
+        path, "index/000000000/000000000070_worker_0.json"
+    )
+    assert trial.last_complete_step == 70
+    shutil.rmtree(path, ignore_errors=True)  # NOTE(review): not reached if an assertion above fails, leaving the directory behind
+
+
 @pytest.mark.slow
 def test_single_writer_all_steps_written_incomplete_job():
     """Test Scenario Description"