Skip to content

Add model saving warning at end of training #171

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Feb 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions docker/1.12.0/Dockerfile.cpu
Original file line number Diff line number Diff line change
Expand Up @@ -76,14 +76,13 @@ COPY $framework_support_installable .

RUN pip install --no-cache-dir -U \
keras==2.2.4 \
sagemaker-containers==2.4.2 \
$framework_support_installable \
"sagemaker-tensorflow>=1.12,<1.13" && \
# Let's install TensorFlow separately in the end to avoid
# the library version to be overwritten
pip install --force-reinstall --no-cache-dir -U \
tensorflow-1.12.0-py2.py3-none-any.whl \
horovod && \
pip install --no-cache-dir -U $framework_support_installable && \
rm -f tensorflow-1.12.0-py2.py3-none-any.whl && \
rm -f $framework_support_installable && \
pip uninstall -y --no-cache-dir \
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def read(fname):
'Programming Language :: Python :: 3.6',
],

install_requires=['sagemaker-containers>=2.3.4', 'numpy', 'scipy', 'sklearn',
install_requires=['sagemaker-containers>=2.4.4', 'numpy', 'scipy', 'sklearn',
'pandas', 'Pillow', 'h5py'],
extras_require={
'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock',
Expand Down
29 changes: 29 additions & 0 deletions src/sagemaker_tensorflow_container/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
logger = logging.getLogger(__name__)

SAGEMAKER_PARAMETER_SERVER_ENABLED = 'sagemaker_parameter_server_enabled'
MODEL_DIR = '/opt/ml/model'


def _is_host_master(hosts, current_host):
Expand Down Expand Up @@ -159,6 +160,33 @@ def train(env):
runner=runner_type)


def _log_model_missing_warning(model_dir):
pb_file_exists = False
file_exists = False
for dirpath, dirnames, filenames in os.walk(model_dir):
if filenames:
file_exists = True
for f in filenames:
if 'saved_model.pb' in f or 'saved_model.pbtxt' in f:
pb_file_exists = True
path, direct_parent_dir = os.path.split(dirpath)
if not str.isdigit(direct_parent_dir):
logger.warn('Your model will NOT be servable with SageMaker TensorFlow Serving containers.'
'The SavedModel bundle is under directory \"{}\", not a numeric name.'
.format(direct_parent_dir))

if not file_exists:
logger.warn('No model artifact is saved under path {}.'
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't we toss an exception here?

Are there any use cases of users not utilizing /opt/ml/model for packaging their models to S3?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They could have saved models or checkpoints to S3 during training. Saving to '/opt/ml/model' just makes sure ease will upload them at the end of training.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you provide any links or references to users who expected their training artifacts to be in /opt/ml/model but didn't know they had to save them there?

' Your training job will not save any model files to S3.\n'
'For details of how to construct your training script see:\n'
'https://github.com/aws/sagemaker-python-sdk/tree/master/src/sagemaker/tensorflow#adapting-your-local-tensorflow-script' # noqa
.format(model_dir))
elif not pb_file_exists:
logger.warn('Your model will NOT be servable with SageMaker TensorFlow Serving container.'
'The model artifact was not saved in the TensorFlow SavedModel directory structure:\n'
'https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory')


def main():
"""Training entry point
"""
Expand All @@ -167,3 +195,4 @@ def main():
s3_utils.configure(env.hyperparameters.get('model_dir'), os.environ.get('SAGEMAKER_REGION'))
logger.setLevel(env.log_level)
train(env)
_log_model_missing_warning(MODEL_DIR)
Empty file.
Empty file.
Empty file.
41 changes: 39 additions & 2 deletions test/unit/test_training.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
PS_TASK_2 = {'index': 1, 'type': 'ps'}
MODEL_DIR = 's3://bucket/prefix'
REGION = 'us-west-2'
RESOURCE_PATH = os.path.join(os.path.dirname(__file__), '..', 'resources')


@pytest.fixture
Expand Down Expand Up @@ -200,18 +201,54 @@ def test_build_tf_config_error():
assert 'Cannot have a ps task if there are no parameter servers in the cluster' in str(error)


@patch('sagemaker_tensorflow_container.training.logger')
def test_log_model_missing_warning_no_model(logger):
    """An empty directory must produce the 'No model artifact' warning."""
    empty_dir = os.path.join(RESOURCE_PATH, 'test_dir_empty')
    # Create the fixture directory on demand if it is not already present.
    if not os.path.exists(empty_dir):
        os.mkdir(empty_dir)

    expected_message = (
        'No model artifact is saved under path {}.'
        ' Your training job will not save any model files to S3.\n'
        'For details of how to construct your training script see:\n'
        'https://github.com/aws/sagemaker-python-sdk/tree/master/src/sagemaker/tensorflow#adapting-your-local-tensorflow-script'  # noqa
    ).format(empty_dir)

    training._log_model_missing_warning(empty_dir)

    logger.warn.assert_called_with(expected_message)


@patch('sagemaker_tensorflow_container.training.logger')
def test_log_model_missing_warning_wrong_format(logger):
    """The ``test_dir_wrong_model`` fixture must produce the SavedModel-structure warning."""
    model_dir = os.path.join(RESOURCE_PATH, 'test_dir_wrong_model')
    expected_message = (
        'Your model will NOT be servable with SageMaker TensorFlow Serving container.'
        'The model artifact was not saved in the TensorFlow '
        'SavedModel directory structure:\n'
        'https://www.tensorflow.org/guide/saved_model#structure_of_a_savedmodel_directory'
    )

    training._log_model_missing_warning(model_dir)

    logger.warn.assert_called_with(expected_message)


@patch('sagemaker_tensorflow_container.training.logger')
def test_log_model_missing_warning_wrong_parent_dir(logger):
    """The ``test_dir_wrong_parent_dir`` fixture must produce the non-numeric-directory warning."""
    model_dir = os.path.join(RESOURCE_PATH, 'test_dir_wrong_parent_dir')
    # The warning is expected to name the offending directory, 'not-digit'.
    expected_message = (
        'Your model will NOT be servable with SageMaker TensorFlow Serving containers.'
        'The SavedModel bundle is under directory \"{}\", not a numeric name.'
    ).format('not-digit')

    training._log_model_missing_warning(model_dir)

    logger.warn.assert_called_with(expected_message)


@patch('sagemaker_tensorflow_container.training.logger')
def test_log_model_missing_warning_correct(logger):
    """The ``test_dir_correct_model`` fixture must not produce any warning."""
    model_dir = os.path.join(RESOURCE_PATH, 'test_dir_correct_model')

    training._log_model_missing_warning(model_dir)

    logger.warn.assert_not_called()


@patch('sagemaker_tensorflow_container.training.logger')
@patch('sagemaker_tensorflow_container.training.train')
@patch('logging.Logger.setLevel')
@patch('sagemaker_containers.beta.framework.training_env')
@patch('sagemaker_containers.beta.framework.env.read_hyperparameters', return_value={})
@patch('sagemaker_tensorflow_container.s3_utils.configure')
def test_main(configure_s3_env, read_hyperparameters, training_env,
set_level, train, single_machine_training_env):
set_level, train, logger, single_machine_training_env):
training_env.return_value = single_machine_training_env
os.environ['SAGEMAKER_REGION'] = REGION
training.main()
read_hyperparameters.assert_called_once_with()
training_env.assert_called_once_with(hyperparameters={})
set_level.assert_called_once_with(LOG_LEVEL)
train.assert_called_once_with(single_machine_training_env)
configure_s3_env.assert_called_once()
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ python =
3.6: py36, flake8

[flake8]
max-line-length = 100
max-line-length = 120
exclude =
build/
.git
Expand Down