Skip to content

Commit 398e2b8

Browse files
authored
Changes for temp location and out_dir with Sagemaker in mind (aws#154)
* Make outdir optional arg, use default path in sagemaker environment, also change temp location when writing local files * remove is_s3 import * add tests and fix case when / is at the front of filepath * add comments * change to .tmp suffix * update testing script to take a tag
1 parent c3a7554 commit 398e2b8

File tree

9 files changed

+133
-27
lines changed

9 files changed

+133
-27
lines changed

bin/sagemaker-containers/tensorflow/run_sagemaker.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44
dir_path = os.path.dirname(os.path.realpath(__file__))
55
train_script_path = os.path.join(dir_path, 'tf-train.py')
66

7+
tag = os.environ.get('SM_TESTING_TAG', 'DEFAULTTAGWHICHWILLFAIL')
8+
79
estimator = TensorFlow(entry_point=train_script_path,
8-
image_name='072677473360.dkr.ecr.us-east-1.amazonaws.com/tornasole-preprod-tf-1.13.1-cpu:latest',
10+
image_name='072677473360.dkr.ecr.us-east-1.amazonaws.com/tornasole-preprod-tf-1.13.1-cpu:' + tag,
911
role='AmazonSageMaker-ExecutionRole-20190614T145575', # hardcode role name
1012
base_job_name='tornasole', #there are some restrictions on base job name so keep it simple
1113
train_instance_count=1,
@@ -15,7 +17,7 @@
1517
estimator.fit()
1618

1719
estimator = TensorFlow(entry_point=train_script_path,
18-
image_name='072677473360.dkr.ecr.us-east-1.amazonaws.com/tornasole-preprod-tf-1.13.1-gpu:latest',
20+
image_name='072677473360.dkr.ecr.us-east-1.amazonaws.com/tornasole-preprod-tf-1.13.1-gpu:' + tag,
1921
role='AmazonSageMaker-ExecutionRole-20190614T145575', # hardcode role name
2022
base_job_name='tornasole', #there are some restrictions on base job name so keep it simple
2123
train_instance_count=1,

tests/core/test_paths.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
from tornasole.core.hook_utils import verify_and_get_out_dir
2+
from tornasole.core.json_config import DEFAULT_SAGEMAKER_TORNASOLE_PATH
3+
from tornasole.core.access_layer.file import get_temp_path, SAGEMAKER_TEMP_PATH_SUFFIX, NON_SAGEMAKER_TEMP_PATH_PREFIX
4+
import os
5+
import uuid
6+
7+
8+
def test_outdir_non_sagemaker():
9+
id = str(uuid.uuid4())
10+
path = '/tmp/tests/' + id
11+
out_dir = verify_and_get_out_dir(path)
12+
assert out_dir == path
13+
os.makedirs(path)
14+
try:
15+
verify_and_get_out_dir(path)
16+
# should raise exception as dir present
17+
assert False
18+
except RuntimeError as e:
19+
pass
20+
21+
22+
def test_outdir_sagemaker():
23+
os.environ['TRAINING_JOB_NAME'] = 'a'
24+
id = str(uuid.uuid4())
25+
paths = ['/tmp/tests/' + id, 's3://tmp/tests/' + id]
26+
for path in paths:
27+
out_dir = verify_and_get_out_dir(path)
28+
assert out_dir == DEFAULT_SAGEMAKER_TORNASOLE_PATH
29+
del os.environ['TRAINING_JOB_NAME']
30+
31+
32+
def test_temp_paths():
33+
for path in ['/opt/ml/output/tensors/events/a',
34+
'/opt/ml/output/tensors/a',
35+
'/opt/ml/output/tensors/events/a/b']:
36+
tp = get_temp_path(path)
37+
assert tp.endswith(SAGEMAKER_TEMP_PATH_SUFFIX)
38+
assert not tp.startswith(NON_SAGEMAKER_TEMP_PATH_PREFIX)
39+
40+
for path in ['/a/b/c', '/opt/ml/output/a', 'a/b/c']:
41+
tp = get_temp_path(path)
42+
assert not SAGEMAKER_TEMP_PATH_SUFFIX in tp
43+
assert tp.startswith(NON_SAGEMAKER_TEMP_PATH_PREFIX)
44+
45+

tornasole/core/access_layer/file.py

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
1+
from tornasole.core.json_config import DEFAULT_SAGEMAKER_TORNASOLE_PATH
12
from .base import TSAccessBase
23
import os
34
import shutil
45

56

7+
NON_SAGEMAKER_TEMP_PATH_PREFIX = '/tmp'
8+
SAGEMAKER_TEMP_PATH_SUFFIX = '.tmp'
9+
10+
611
def ensure_dir(file_path, is_file=True):
712
if is_file:
813
directory = os.path.dirname(file_path)
@@ -13,6 +18,18 @@ def ensure_dir(file_path, is_file=True):
1318
os.makedirs(directory)
1419

1520

21+
def get_temp_path(file_path):
22+
directory = os.path.dirname(file_path)
23+
basename = os.path.basename(file_path)
24+
if directory.startswith(DEFAULT_SAGEMAKER_TORNASOLE_PATH):
25+
temp_path = file_path + SAGEMAKER_TEMP_PATH_SUFFIX
26+
else:
27+
if len(file_path) > 0 and file_path[0] == '/':
28+
file_path = file_path[1:]
29+
temp_path = os.path.join(NON_SAGEMAKER_TEMP_PATH_PREFIX, file_path)
30+
return temp_path
31+
32+
1633
WRITE_MODES = ['w', 'w+', 'wb', 'wb+', 'a', 'a+', 'ab', 'ab+']
1734

1835

@@ -24,7 +41,7 @@ def __init__(self, path, mode):
2441
ensure_dir(path)
2542

2643
if mode in WRITE_MODES:
27-
self.temp_path = os.path.join('/tmp', self.path)
44+
self.temp_path = get_temp_path(self.path)
2845
ensure_dir(self.temp_path)
2946
self.open(self.temp_path, mode)
3047
else:

tornasole/core/access_layer/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ def training_has_ended(trial_prefix):
2626
else:
2727
writer = TSAccessFile(file_path, 'a+')
2828
writer.write("end of training job")
29+
writer.flush()
2930
writer.close()
3031

3132

tornasole/core/hook_utils.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
import os
2+
3+
from .json_config import DEFAULT_SAGEMAKER_TORNASOLE_PATH
4+
from .sagemaker_utils import is_sagemaker_job, get_sagemaker_out_dir
5+
from .utils import is_s3, check_dir_exists, get_logger
6+
7+
logger = get_logger()
8+
9+
10+
def verify_and_get_out_dir(out_dir):
11+
if is_sagemaker_job():
12+
if out_dir is not None:
13+
logger.warning(
14+
'The out_dir parameter was set but job is running in a '
15+
'SageMaker environment, hence it is being ignored. '
16+
'Writing tensors to '
17+
'{}'.format(DEFAULT_SAGEMAKER_TORNASOLE_PATH))
18+
out_dir = get_sagemaker_out_dir()
19+
# here we don't check whether the directory exists because
20+
# sagemaker creates the directory and it will exist.
21+
# sagemaker assures us that this directory is unique for each job
22+
# so there is no chance of tensors from two jobs being merged
23+
else:
24+
if out_dir is None:
25+
raise RuntimeError(
26+
'out_dir is a required argument when '
27+
'running outside of SageMaker environments')
28+
is_s3_path, _, _ = is_s3(out_dir)
29+
if not is_s3_path:
30+
out_dir = os.path.expanduser(out_dir)
31+
# we check and raise error if directory already exists because
32+
# we don't want to merge tensors from current job with
33+
# tensors from previous job
34+
check_dir_exists(out_dir)
35+
36+
return out_dir

tornasole/core/sagemaker_utils.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,21 @@
11
import boto3
2+
import os
3+
from .json_config import DEFAULT_SAGEMAKER_TORNASOLE_PATH
4+
5+
6+
def is_sagemaker_job():
7+
"""
8+
If this variable is defined we are assuming that this is
9+
a Sagemaker job. This is guaranteed to be defined
10+
for all Sagemaker jobs.
11+
https://docs.aws.amazon.com/sagemaker/latest/dg/your-algorithms-training-algo-running-container.html#your-algorithms-training-algo-running-container-environment-variables
12+
:return: True or False
13+
"""
14+
return 'TRAINING_JOB_NAME' in os.environ
15+
16+
17+
def get_sagemaker_out_dir():
18+
return DEFAULT_SAGEMAKER_TORNASOLE_PATH
219

320

421
class SageMakerUtils:

tornasole/mxnet/hook.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
from tornasole.core.save_config import SaveConfig
44
from tornasole.core.save_manager import SaveManager
55
from tornasole.core.modes import ModeKeys, ALLOWED_MODES
6-
from tornasole.core.utils import get_logger, is_s3
6+
from tornasole.core.utils import get_logger
77
from tornasole.core.reductions import get_reduction_tensor_name
88
from tornasole.core.json_config import TORNASOLE_CONFIG_DEFAULT_WORKER_NAME, create_hook_from_json_config
99
from tornasole.core.access_layer.utils import training_has_ended
10+
from tornasole.core.hook_utils import verify_and_get_out_dir
1011
from .mxnet_collection import get_collection_manager, get_collection
1112
from .util import get_aggregated_data, make_numpy_array
1213
import re as _re
@@ -27,20 +28,15 @@
2728

2829
class TornasoleHook:
2930
def __init__(self,
30-
out_dir,
31+
out_dir=None,
3132
dry_run=False,
3233
worker=TORNASOLE_CONFIG_DEFAULT_WORKER_NAME,
3334
reduction_config=None,
3435
save_config=None,
3536
include_regex=None,
3637
include_collections=DEFAULT_INCLUDE_COLLECTIONS,
3738
save_all=False):
38-
if not is_s3(out_dir)[0]:
39-
out_dir = os.path.expanduser(out_dir)
40-
## This is commented becuase Sagemaker creates out_dir
41-
# This was created because we don't want user to overwrite their existing data
42-
# check_dir_exists(out_dir)
43-
self.out_dir = out_dir
39+
self.out_dir = verify_and_get_out_dir(out_dir)
4440
self.out_base_dir = os.path.dirname(out_dir)
4541
self.run_id = os.path.basename(out_dir)
4642
self.include_collections = include_collections

tornasole/pytorch/hook.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
from tornasole.core.save_config import SaveConfig
44
from tornasole.core.save_manager import SaveManager
55
from tornasole.core.modes import ModeKeys, ALLOWED_MODES
6-
from tornasole.core.utils import get_logger, is_s3
6+
from tornasole.core.utils import get_logger
7+
from tornasole.core.hook_utils import verify_and_get_out_dir
78
from tornasole.core.reductions import get_reduction_tensor_name
89
from tornasole.core.json_config import create_hook_from_json_config
910
from tornasole.pytorch.torch_collection import get_collection_manager, get_collection
@@ -27,20 +28,15 @@
2728

2829
class TornasoleHook:
2930
def __init__(self,
30-
out_dir,
31+
out_dir=None,
3132
dry_run=False,
3233
worker=DEFAULT_WORKER_NAME,
3334
reduction_config=None,
3435
save_config=None,
3536
include_regex=None,
3637
include_collections=DEFAULT_INCLUDE_COLLECTIONS,
3738
save_all=False):
38-
if not is_s3(out_dir)[0]:
39-
out_dir = os.path.expanduser(out_dir)
40-
# this is commented because SM creates dir.
41-
# This was created because we don't want user to overwrite their existing data
42-
#check_dir_exists(out_dir)
43-
self.out_dir = out_dir
39+
self.out_dir = verify_and_get_out_dir(out_dir)
4440
self.out_base_dir = os.path.dirname(out_dir)
4541
self.run_id = os.path.basename(out_dir)
4642
self.include_collections = include_collections

tornasole/tensorflow/hook.py

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,8 @@
66
from .reductions import get_tensorflow_reduction
77
from .collection import *
88
from tornasole.core.writer import FileWriter
9-
from tornasole.core.utils import get_logger, flatten, is_s3, match_inc
9+
from tornasole.core.utils import get_logger, flatten, match_inc
10+
from tornasole.core.hook_utils import verify_and_get_out_dir
1011
from tornasole.core.reductions import get_reduction_tensor_name
1112
from tornasole.core.json_config import TORNASOLE_CONFIG_DEFAULT_WORKER_NAME, create_hook_from_json_config
1213
from tornasole.core.modes import ModeKeys, ALLOWED_MODES
@@ -17,7 +18,7 @@
1718
DEFAULT_INCLUDE_COLLECTIONS = ['weights', 'gradients', 'default']
1819

1920
class TornasoleHook(tf.train.SessionRunHook):
20-
def __init__(self, out_dir,
21+
def __init__(self, out_dir=None,
2122
dry_run=False,
2223
worker=TORNASOLE_CONFIG_DEFAULT_WORKER_NAME,
2324
reduction_config=None,
@@ -68,12 +69,7 @@ def __init__(self, out_dir,
6869
a shortcut for saving all tensors in the model.
6970
they are all saved in the collection `all`
7071
"""
71-
if not is_s3(out_dir)[0]:
72-
out_dir = os.path.expanduser(out_dir)
73-
# this is commented because SM creates dir.
74-
# This was created because we don't want user to overwrite their existing data
75-
#check_dir_exists(out_dir)
76-
self.out_dir = out_dir
72+
self.out_dir = verify_and_get_out_dir(out_dir)
7773
self.out_base_dir = os.path.dirname(out_dir)
7874
self.run_id = os.path.basename(out_dir)
7975

0 commit comments

Comments
 (0)