Skip to content

Commit 1ac5e38

Browse files
NihalHarish authored and rahul003 committed
DT Trial APIs (aws#239)
* test * added missing import * get worker name from func * random seed: * str error * error handling * hvd-size * worker name to collection * removed unnecessary comment * try catch for modulenotfound * simplified mkdir error handling * handle hvd.size ValueError * modify try except block * moving worker name to utils * worker to collections.ts * list collection files * checkpoint_1 * Mirrored Variable * mirrored var * revert accidental change * print statement cleanup * save json file * update num workers * s3 mode trial api * tests starting * fixed test trial modes * bug fix * removed randint generator * pytorch test fix * fixed bug in test * merge conflicts and tf tests * merge alpha to fix breaking tests * code cleanup * fixed get correct worker bug * fixed None not 0 bug * made worker name for hvd more verbose * fixed mode step bug * changed worker name parsing logic * missing line * removed hard coded worker_num values * parameterized collections name * changes for PR comments * PR changes * step variable rename and refactor * reverting 4eeb4d4 * refactored changes * read test * refactor changes * handle import error for pytorch * Handle Import Error * removed worker from hook, set default value to num_workers, addressed PR comments * doc string update * replace StepIsUnavailable with ValueError and fix return with raise
1 parent 35e5ce0 commit 1ac5e38

35 files changed

+374
-181
lines changed

tests/analysis/rules/test_confusion.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from tornasole.core.writer import FileWriter
2-
from tornasole.core.collection_manager import CollectionManager, COLLECTIONS_FILE_NAME
2+
from tornasole.core.collection_manager import CollectionManager
3+
from tornasole.core.config_constants import TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME
34

45
from tornasole.rules.generic import Confusion
56
from tornasole.trials import create_trial
@@ -10,7 +11,8 @@
1011
from tornasole.exceptions import *
1112
from tornasole.rules.rule_invoker import invoke_rule
1213

13-
def gen_y_and_y_hat( path, trial, step, y, y_name, y_hat, y_hat_name, colls = {} ):
14+
15+
def gen_y_and_y_hat( path, trial, step, y, y_name, y_hat, y_hat_name, colls={}):
1416
trial_dir=os.path.join(path, trial)
1517
with FileWriter(trial_dir=trial_dir,
1618
step=step, worker='algo-1') as fw:
@@ -20,7 +22,7 @@ def gen_y_and_y_hat( path, trial, step, y, y_name, y_hat, y_hat_name, colls = {}
2022
for coll in colls:
2123
c.add(coll)
2224
c.get(coll).tensor_names = colls[coll]
23-
c.export(os.path.join(trial_dir, COLLECTIONS_FILE_NAME))
25+
c.export(os.path.join(trial_dir, TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME))
2426

2527

2628
def test_confusion():
@@ -32,19 +34,19 @@ def test_confusion():
3234
run_id = str(uuid.uuid4())
3335
y = np.random.randint(cat_no,size=(100,))
3436
y_hat = y
35-
gen_y_and_y_hat( path, run_id, 0, y, 'y', y_hat, 'y_hat', colls = { 'labels' : [ 'y'], 'preds': ['y_hat'] } )
37+
gen_y_and_y_hat( path, run_id, 0, y, 'y', y_hat, 'y_hat', colls = { 'labels' : [ 'y'], 'preds': ['y_hat']})
3638
tr = create_trial(os.path.join(path, run_id))
37-
r = Confusion(tr, labels_collection='labels', predictions_collection='preds' )
39+
r = Confusion(tr, labels_collection='labels', predictions_collection='preds')
3840
invoke_rule(r, start_step=0, end_step=1, raise_eval_cond=True)
3941

4042
# Test 2: should fail on row 4 because the
4143
run_id = str(uuid.uuid4())
4244
y = np.arange(cat_no)
4345
y_hat = np.copy(y)
4446
y_hat[4] = 7
45-
gen_y_and_y_hat( path, run_id, 1, y, 'foo', y_hat, 'bar' )
47+
gen_y_and_y_hat( path, run_id, 1, y, 'foo', y_hat, 'bar')
4648
tr = create_trial(os.path.join(path, run_id))
47-
r = Confusion(tr, cat_no, 'foo', 'bar' )
49+
r = Confusion(tr, cat_no, 'foo', 'bar')
4850
try:
4951
invoke_rule(r, start_step=1, end_step=2, raise_eval_cond=True)
5052
assert False
@@ -64,7 +66,7 @@ def test_confusion():
6466
y = np.arange(10)
6567
y_hat = y
6668
# 'label' and 'pred' are magic names
67-
gen_y_and_y_hat( path, run_id, 1, y, 'labels', y_hat, 'predictions' )
69+
gen_y_and_y_hat( path, run_id, 1, y, 'labels', y_hat, 'predictions')
6870
tr = create_trial(os.path.join(path, run_id))
6971
r = Confusion(tr)
7072
invoke_rule(r, start_step=1, end_step=2, raise_eval_cond=True)

tests/analysis/trials/test_modes.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,9 @@
55
from tornasole.core.tensor import StepState
66
from datetime import datetime
77
from tornasole.core.writer import FileWriter
8-
from tornasole.core.collection_manager import CollectionManager, COLLECTIONS_FILE_NAME
8+
from tornasole.core.collection_manager import CollectionManager
9+
from tornasole.core.config_constants import TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME
10+
911

1012
def test_modes_on_global_data():
1113
pass # other tests in create, local, s3 do this
@@ -18,7 +20,7 @@ def test_mode_data():
1820
c = CollectionManager()
1921
c.add("default")
2022
c.get("default").tensor_names = ["arr"]
21-
c.export(os.path.join(trial_dir, COLLECTIONS_FILE_NAME))
23+
c.export(os.path.join(trial_dir, TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME))
2224
tr = create_trial(trial_dir)
2325
for s in range(0, 10):
2426
fw = FileWriter(trial_dir=trial_dir, step=s)

tests/analysis/trials/test_s3.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from tornasole.trials import S3Trial
2-
from tornasole.core.collection_manager import CollectionManager, \
3-
COLLECTIONS_FILE_NAME
2+
from tornasole.core.collection_manager import CollectionManager
3+
from tornasole.core.config_constants import TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME
44
import uuid
55
import os
66
import pytest
@@ -33,8 +33,8 @@ def help_test_multiple_trials(num_steps = 20, num_tensors = 10):
3333
c = CollectionManager()
3434
c.add("default")
3535
c.get("default").tensor_names = ["foo_" + str(i) for i in range(num_tensors)]
36-
c.export(path + trial_name + "/" + COLLECTIONS_FILE_NAME)
37-
c.export(path + trial_name + "/" + COLLECTIONS_FILE_NAME)
36+
c.export(path + trial_name + "/" + TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME)
37+
c.export(path + trial_name + "/" + TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME)
3838
for i in range(num_steps):
3939
generate_data(path=path, trial=trial_name, num_tensors=num_tensors,
4040
step=i, tname_prefix='foo', worker='algo-1', shape=(3, 3, 3), rank=0)

tests/analysis/utils.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
from tornasole.core.writer import FileWriter
22
import numpy as np
3-
from tornasole.core.collection_manager import CollectionManager, \
4-
COLLECTIONS_FILE_NAME
3+
from tornasole.core.collection_manager import CollectionManager
4+
from tornasole.core.config_constants import TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME
55
import os
66
import aioboto3
77
import asyncio
88
from tornasole.core.access_layer.s3handler import S3Handler, ListRequest
99

10+
1011
def generate_data(path, trial, step, tname_prefix,
1112
num_tensors, worker, shape, dtype=np.float32,
1213
rank=None, mode=None, mode_step=None, export_colls=True,
@@ -24,7 +25,7 @@ def generate_data(path, trial, step, tname_prefix,
2425
c.get("default").tensor_names = [f'{tname_prefix}_{i}' for i in range(num_tensors)]
2526
c.add('gradients')
2627
c.get("gradients").tensor_names = [f'{tname_prefix}_{i}' for i in range(num_tensors)]
27-
c.export(os.path.join(path, trial, COLLECTIONS_FILE_NAME))
28+
c.export(os.path.join(path, trial, TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME))
2829

2930

3031
def check_trial(trial_obj, num_steps, num_tensors):

tests/core/test_collections.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from tornasole.core.collection import Collection
2-
from tornasole.core.collection_manager import CollectionManager, \
3-
COLLECTIONS_FILE_NAME
2+
from tornasole.core.collection_manager import CollectionManager
3+
from tornasole.core.config_constants import TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME
44
from tornasole.core.reduction_config import ReductionConfig
55
from tornasole.core.save_config import SaveConfig, SaveConfigMode
66
from tornasole.core.modes import ModeKeys
@@ -42,8 +42,8 @@ def test_manager_export_load():
4242
cm.add(Collection('trial1'))
4343
cm.add('trial2')
4444
cm.get('trial2').include('total_loss')
45-
cm.export(COLLECTIONS_FILE_NAME)
46-
cm2 = CollectionManager.load(COLLECTIONS_FILE_NAME)
45+
cm.export(TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME)
46+
cm2 = CollectionManager.load(TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME)
4747
assert cm == cm2
4848

4949
def test_manager():

tests/core/test_modes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def test_mode_writing():
1616
else:
1717
fw.write_tensor(tdata=np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32),
1818
tname='arr', mode=ModeKeys.EVAL, mode_step=s // 2)
19-
fw.close()
19+
fw.close()
2020
files = glob.glob('ts_outputs/' + run_id + '/**/*.tfevents',
2121
recursive=True)
2222
for f in files:

tests/pytorch/test_simple_write.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,8 @@ def helper_test_weights_bias_gradients(hook=None):
178178

179179

180180
def saveall_test_helper(hook=None):
181-
reset_collections()
181+
if hook is None:
182+
reset_collections()
182183
prefix = str(uuid.uuid4())
183184
hook_type = 'saveall'
184185
device = torch.device("cpu")
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from tornasole.trials import create_trial
2+
3+
4+
def test_s3_read():
5+
path = "s3://tornasole-testing/dist-logs-10/"
6+
trial = create_trial(path)
7+
tensors = trial.tensors()
8+
assert len(tensors) == 17
9+
t = trial.tensor('gradients/dense_1/MatMul_grad/tuple/control_dependency_1:0')
10+
steps = t.steps()
11+
assert steps == [0]
12+
workers = t.workers_for_step(0)
13+
assert len(workers) == 16
14+
truth_table = t.value(0, worker='worker_10') == t.value(0, worker='worker_1')
15+
assert truth_table.all() == False

tests/tensorflow/hooks/test_reductions.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def helper_test_reductions(trial_dir, hook):
2626
assert len(t.reduction_values(0)) == 18
2727
for r in ALLOWED_REDUCTIONS + ALLOWED_NORMS:
2828
for b in [False, True]:
29-
assert t.reduction_value(0, reduction_name=r, abs=b) is not None
29+
assert t.reduction_value(0, reduction_name=r, abs=b, worker=None) is not None
3030

3131

3232
def test_reductions():

tests/tensorflow/hooks/test_save_all_full.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import shutil, glob
44
from tornasole.core.reader import FileReader
55
from tornasole.core.json_config import TORNASOLE_CONFIG_FILE_PATH_ENV_STR
6-
from tornasole.core.collection_manager import COLLECTIONS_FILE_NAME
6+
from tornasole.core.config_constants import TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME
77

88

99
def test_save_all_full(hook=None, trial_dir=None):
@@ -29,8 +29,8 @@ def test_save_all_full(hook=None, trial_dir=None):
2929
assert len(coll['gradients'].tensor_names) == 1
3030
assert len(coll['losses'].tensor_names) == 1
3131

32-
assert COLLECTIONS_FILE_NAME in files
33-
cm = CollectionManager.load(join(trial_dir, COLLECTIONS_FILE_NAME))
32+
assert TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME in files
33+
cm = CollectionManager.load(join(trial_dir, TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME))
3434

3535
assert len(cm.collections) == 6
3636
assert len(cm.collections['weights'].tensor_names) == 1

tests/tensorflow/hooks/test_save_config.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
from tornasole.trials import create_trial
2-
from tornasole import SaveConfig
32
from .utils import *
43
from tests.tensorflow.hooks.test_estimator_modes import help_test_mnist
54
from tornasole.tensorflow import reset_collections, get_collection, TornasoleHook, modes

tests/tensorflow/hooks/test_save_reductions.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,9 @@
22
from tornasole.tensorflow import reset_collections, get_collections, CollectionManager
33
import shutil
44
import glob
5-
from tornasole.core.collection_manager import COLLECTIONS_FILE_NAME
65
from tornasole.core.reader import FileReader
76
from tornasole.core.json_config import TORNASOLE_CONFIG_FILE_PATH_ENV_STR
7+
from tornasole.core.config_constants import TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME
88

99
def helper_save_reductions(trial_dir, hook):
1010
simple_model(hook)
@@ -14,8 +14,8 @@ def helper_save_reductions(trial_dir, hook):
1414
assert len(coll['weights'].tensor_names) == 1
1515
assert len(coll['gradients'].tensor_names) == 1
1616

17-
assert COLLECTIONS_FILE_NAME in files
18-
cm = CollectionManager.load(join(trial_dir, COLLECTIONS_FILE_NAME))
17+
assert TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME in files
18+
cm = CollectionManager.load(join(trial_dir, TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME))
1919
assert len(cm.collections) == 5
2020
assert len(cm.collections['weights'].tensor_names) == 1
2121
assert len(cm.collections['gradients'].tensor_names) == 1

tests/tensorflow/hooks/test_simple_include.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
from .utils import *
2-
from tornasole.tensorflow import reset_collections, get_collection
2+
from tornasole.tensorflow import get_collection
33
import tornasole.tensorflow as ts
44
import glob, shutil
55
from tornasole.core.reader import FileReader
66
from tornasole.core.json_config import TORNASOLE_CONFIG_FILE_PATH_ENV_STR
7-
from tornasole.core.collection_manager import COLLECTIONS_FILE_NAME
7+
from tornasole.core.config_constants import TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME
8+
89

910

1011
def helper_test_simple_include(trial_dir, hook):
@@ -13,7 +14,7 @@ def helper_test_simple_include(trial_dir, hook):
1314
_, files = get_dirs_files(trial_dir)
1415
steps, _ = get_dirs_files(os.path.join(trial_dir, 'events'))
1516

16-
cm = CollectionManager.load(join(trial_dir, COLLECTIONS_FILE_NAME))
17+
cm = CollectionManager.load(join(trial_dir, TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME))
1718
assert len(cm.collections['default'].tensor_names) == 1
1819
assert len(steps) == 5
1920
for step in steps:
@@ -57,7 +58,7 @@ def helper_test_simple_include_regex(trial_dir, hook):
5758
_, files = get_dirs_files(trial_dir)
5859
steps, _ = get_dirs_files(os.path.join(trial_dir, 'events'))
5960

60-
cm = CollectionManager.load(join(trial_dir, COLLECTIONS_FILE_NAME))
61+
cm = CollectionManager.load(join(trial_dir, TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME))
6162
assert len(cm.collections['default'].tensor_names) == 1
6263
assert len(steps) == 5
6364

@@ -104,7 +105,7 @@ def helper_test_multi_collection_match(trial_dir, hook):
104105
_, files = get_dirs_files(trial_dir)
105106
steps, _ = get_dirs_files(os.path.join(trial_dir, 'events'))
106107

107-
cm = CollectionManager.load(join(trial_dir, COLLECTIONS_FILE_NAME))
108+
cm = CollectionManager.load(join(trial_dir, TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME))
108109
assert len(cm.collections['default'].tensor_names) == 1
109110
assert len(cm.collections['trial'].tensor_names) == 1
110111
assert len(steps) == 5

tests/tensorflow/hooks/test_weights_gradients.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from .utils import *
22
from tornasole.tensorflow import reset_collections
33
import tensorflow.compat.v1 as tf
4-
from tornasole.core.collection_manager import COLLECTIONS_FILE_NAME
4+
from tornasole.core.config_constants import TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME
55
from tornasole.core.json_config import TORNASOLE_CONFIG_FILE_PATH_ENV_STR
66
import tornasole.tensorflow as ts
77
import shutil
@@ -12,8 +12,8 @@ def helper_test_only_w_g(trial_dir, hook):
1212
steps, _ = get_dirs_files(os.path.join(trial_dir, 'events'))
1313
_, files = get_dirs_files(trial_dir)
1414

15-
assert COLLECTIONS_FILE_NAME in files
16-
cm = CollectionManager.load(join(trial_dir, COLLECTIONS_FILE_NAME))
15+
assert TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME in files
16+
cm = CollectionManager.load(join(trial_dir, TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME))
1717
num_tensors_loaded_collection = len(cm.collections['weights'].tensor_names) + \
1818
len(cm.collections['gradients'].tensor_names) + \
1919
len(cm.collections['default'].tensor_names)

tests/xgboost/test_hook.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import os
2-
import json
32
import uuid
43
import numpy as np
54
import pytest

tornasole/core/access_layer/file.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,8 @@ def ensure_dir(file_path, is_file=True):
1212
directory = os.path.dirname(file_path)
1313
else:
1414
directory = file_path
15-
1615
if directory and not os.path.exists(directory):
17-
os.makedirs(directory)
16+
os.makedirs(directory, exist_ok=True)
1817

1918

2019
def get_temp_path(file_path):

tornasole/core/collection_manager.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@
33
from .utils import is_s3, load_json_as_dict
44
import json
55

6-
ALLOWED_PARAMS = ['collections']
7-
COLLECTIONS_FILE_NAME = 'collections.json'
6+
ALLOWED_PARAMS = ['collections', '_meta']
7+
88

99
class CollectionManager:
1010
"""
@@ -16,6 +16,8 @@ def __init__(self, collections=None, create_default=False):
1616
if collections is None:
1717
collections = {}
1818
self.collections = collections
19+
self._meta = {}
20+
self._meta['num_workers'] = 1
1921

2022
def create_collection(self, name, cls=Collection):
2123
if name not in self.collections:
@@ -40,6 +42,18 @@ def get(self, name, create=True):
4042
raise KeyError(f'Collection {name} has not been created')
4143
return self.collections[name]
4244

45+
def update_meta(self, meta):
46+
assert isinstance(meta, dict)
47+
self._meta.update(meta)
48+
49+
50+
def get_num_workers(self):
51+
return int(self._meta['num_workers'])
52+
53+
54+
def set_num_workers(self, num_workers):
55+
self._meta['num_workers'] = int(num_workers)
56+
4357
def to_json_dict(self):
4458
d = dict()
4559
for a, v in self.__dict__.items():
@@ -81,6 +95,7 @@ def load_from_string(cls, s, collection_class=Collection):
8195
for c_name, c_dict in params['collections'].items():
8296
coll = collection_class.from_dict(c_dict)
8397
cm.add(coll)
98+
cm.update_meta(params['_meta'])
8499
return cm
85100

86101
def __eq__(self, other):
@@ -90,4 +105,4 @@ def __eq__(self, other):
90105

91106
def __repr__(self):
92107
return f"<class CollectionManager: " \
93-
f"collection_names={list(self.collections.keys())}>"
108+
f"collection_names={list(self.collections.keys())}>"

tornasole/core/config_constants.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
TORNASOLE_CONFIG_DEFAULT_WORKER_NAME = "worker0"
1+
TORNASOLE_CONFIG_DEFAULT_WORKER_NAME = "worker_0"
22
TORNASOLE_CONFIG_FILE_PATH_ENV_STR = "TORNASOLE_CONFIG_FILE_PATH"
33
DEFAULT_CONFIG_FILE_PATH = "/opt/ml/input/data/tornasole-config/tornasole-hook-config.json"
44
TORNASOLE_CONFIG_REDUCTION_CONFIGS_KEY = "reduction_configs"
@@ -9,3 +9,4 @@
99
TORNASOLE_CONFIG_INCLUDE_REGEX_KEY = "include_regex"
1010
TORNASOLE_CONFIG_SAVE_ALL_KEY = "save_all"
1111
DEFAULT_SAGEMAKER_TORNASOLE_PATH = "/opt/ml/output/tensors"
12+
TORASOLE_DEFAULT_COLLECTIONS_FILE_NAME = 'worker_0_collections.json'

0 commit comments

Comments
 (0)