Running tests in parallel for RL notebooks. (#1624)

goelakash · goel-akas · web-flow · commit e4efb1f7eb93 · 2020-10-19T21:47:32.000-07:00
Co-authored-by: Akash Goel &lt;goelakas@amazon.com&gt;
diff --git a/reinforcement_learning/testnotebooks.txt b/reinforcement_learning/testnotebooks.txt
@@ -1,2 +1,2 @@
-rl_cartpole_ray/rl_cartpole_ray_gymEnv.ipynb
-rl_cartpole_coach/rl_cartpole_coach_gymEnv.ipynb
+./rl_roboschool_ray/rl_roboschool_ray.ipynb
+./rl_cartpole_ray/rl_cartpole_ray_gymEnv.ipynb
diff --git a/reinforcement_learning/testrunner.py b/reinforcement_learning/testrunner.py
@@ -1,10 +1,12 @@
 #!/usr/bin/env python
 # coding: utf-8
+from datetime import datetime
+startTime = datetime.now()
 
 import itertools
 import json
+import multiprocessing as mp
 import os
-import subprocess
 import sys
 import time
 try:
@@ -15,39 +17,53 @@
     sys.exit("""Some libraries are missing. Please install them by running `pip install -r test_requirements.txt`.""")
 
 # CONSTANTS
+manager = mp.Manager()
 TEST_NOTEBOOKS_FILE = 'testnotebooks.txt'
 TEST_CONFIG_FILE = 'testconfig.csv'
-SUCCESSES = 0
-EXCEPTIONS = 0
-SUCCESSFUL_EXECUTIONS = []
-FAILED_EXECUTIONS = []
+SUCCESSES = mp.Value('d', 0)
+EXCEPTIONS = mp.Value('d', 0)
+SUCCESSFUL_EXECUTIONS = manager.list()
+FAILED_EXECUTIONS = manager.list()
 CELL_EXECUTION_TIMEOUT_SECONDS = 1200
+ROOT = os.path.abspath('.')
+
+jobs = []
 
 
 # helper functions
-def run_notebook(nb_path, test_config):
-    dir_name = os.path.dirname(nb_path)
+
+def execute_nb_with_params(nb_path, params):
+    abs_nb_dir_path = os.path.join(ROOT, os.path.dirname(nb_path))
     nb_name = os.path.basename(nb_path)
     output_nb_name = "output_{}.ipynb".format(nb_name)
-    os.chdir(dir_name)
+    os.chdir(abs_nb_dir_path)
     print("Current directory: {}".format(os.getcwd()))
-    global SUCCESSES
-    global EXCEPTIONS
-    for i in range(len(test_config)):
-        params = json.loads(test_config.loc[i].to_json())
+    print("RUN: " + nb_name + " with parameters " + str(params))
+    # Execute notebook
+    test_case = dict({'notebook':nb_name, 'params':params})
+    try:
+        papermill.execute_notebook(nb_name, output_nb_name, parameters=params, execution_timeout=CELL_EXECUTION_TIMEOUT_SECONDS, log_output=True)
+        SUCCESSES.value += 1
+        SUCCESSFUL_EXECUTIONS.append(test_case)
+    except BaseException as error:
+        print('An exception occurred: {}'.format(error))
+        EXCEPTIONS.value += 1
+        FAILED_EXECUTIONS.append(test_case)
+    os.chdir(ROOT)
+
+
+def test_notebook(nb_path, df_test_config):
+    for i in range(len(df_test_config)):
+        params = json.loads(df_test_config.loc[i].to_json())
         # Coach notebooks support only single instance training, so skip the tests with multiple EC2 instances
-        if 'coach' in nb_name.lower() and params['train_instance_count'] > 1:
+        if 'coach' in nb_path.lower() and params['train_instance_count'] > 1:
             continue
-        print("\nTEST: " + nb_name + " with parameters " + str(params))
-        process = None
-        try:
-            papermill.execute_notebook(nb_name, output_nb_name, parameters=params, execution_timeout=CELL_EXECUTION_TIMEOUT_SECONDS, log_output=True)
-            SUCCESSES += 1
-            SUCCESSFUL_EXECUTIONS.append(dict({'notebook':nb_name, 'params':params}))
-        except BaseException as error:
-            print('An exception occurred: {}'.format(error))
-            EXCEPTIONS += 1
-            FAILED_EXECUTIONS.append(dict({'notebook':nb_name, 'params':params}))
+        p = mp.Process(target=execute_nb_with_params, args=(nb_path, params))
+        time.sleep(1)
+        jobs.append(p)
+        p.start()
+
+
 
 def print_notebook_executions(nb_list_with_params):
     # This expects a list of dict type items.
@@ -65,26 +81,30 @@ def print_notebook_executions(nb_list_with_params):
     print(tabulate(pd.DataFrame([v for v in vals], columns=keys), showindex=False))
 
 
+if __name__ == "__main__":
+    notebooks_list = open(TEST_NOTEBOOKS_FILE).readlines()
+    config = pd.read_csv(TEST_CONFIG_FILE)
+    # Run tests on each notebook listed in the config.
+    print("Test Configuration: ")
+    print(config)
 
-notebooks_list = open(TEST_NOTEBOOKS_FILE).readlines()
-config = pd.read_csv(TEST_CONFIG_FILE)
-ROOT = os.path.abspath('.')
+    for nb_path in notebooks_list:
+        print("Testing: {}".format(nb_path))
+        test_notebook(nb_path.strip(), config)
 
-# Run tests on each notebook listed in the config.
-print("Test Configuration: ")
-print(config)
-for nb_path in notebooks_list:
-    os.chdir(ROOT)
-    print("Testing: {}".format(nb_path))
-    run_notebook(nb_path.strip(), config)
-
-# Print summary of tests ran.
-print("Summary: {}/{} tests passed.".format(SUCCESSES, SUCCESSES + EXCEPTIONS))
-print("Successful executions: ")
-print_notebook_executions(SUCCESSFUL_EXECUTIONS)
-
-# Throw exception if any test fails, so that the CodeBuild also fails.
-if EXCEPTIONS > 0:
-    print("Failed executions: ")
-    print_notebook_executions(FAILED_EXECUTIONS)
-    raise Exception("Test did not complete successfully")
+    for job in jobs:
+        job.join()
+
+    # Print summary of tests ran.
+    print("Summary: {}/{} tests passed.".format(SUCCESSES.value, SUCCESSES.value + EXCEPTIONS.value))
+    print("Successful executions: ")
+    print_notebook_executions(SUCCESSFUL_EXECUTIONS)
+
+    # Throw exception if any test fails, so that the CodeBuild also fails.
+    if EXCEPTIONS.value > 0:
+        print("Failed executions: ")
+        print_notebook_executions(FAILED_EXECUTIONS)
+        raise Exception("Test did not complete successfully")
+
+print("Total time taken for tests: ")
+print(datetime.now() - startTime)