Skip to content

Fix TensorFlow log syncing bug #105

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Mar 22, 2018
Merged
53 changes: 49 additions & 4 deletions src/sagemaker/tensorflow/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,10 @@
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import contextlib
import logging
import os
import shutil
import subprocess
import tempfile
import threading
Expand Down Expand Up @@ -46,6 +48,47 @@ def _cmd_exists(cmd):
for path in os.environ["PATH"].split(os.pathsep)
)

@staticmethod
def _sync_directories(from_directory, to_directory):
"""Sync to_directory with from_directory by copying each file in
to_directory with new contents. Files in to_directory will be
overwritten by files of the same name in from_directory. We need to
keep two copies of the log directory because otherwise TensorBoard
picks up temp files from `aws s3 sync` and then stops reading the
correct tfevent files. We walk the directory and copy each file
individually because the directory that TensorBoard watches needs to
always exist.

Args:
from_directory (str): The directory with updated files.
to_directory (str): The directory to be synced.
"""
if not os.path.exists(to_directory):
os.mkdir(to_directory)
for root, dirs, files in os.walk(from_directory):
to_root = root.replace(from_directory, to_directory)
for directory in dirs:
to_child_dir = os.path.join(to_root, directory)
if not os.path.exists(to_child_dir):
os.mkdir(to_child_dir)
for fname in files:
from_file = os.path.join(root, fname)
to_file = os.path.join(to_root, fname)
with open(from_file, 'rb') as a, open(to_file, 'wb') as b:
b.write(a.read())

@staticmethod
@contextlib.contextmanager
def _temporary_directory():
"""Context manager for a temporary directory. This is similar to
tempfile.TemporaryDirectory in python>=3.2.
"""
name = tempfile.mkdtemp()
try:
yield name
finally:
shutil.rmtree(name)

def validate_requirements(self):
"""Ensure that TensorBoard and the AWS CLI are installed.

Expand Down Expand Up @@ -96,10 +139,12 @@ def run(self):
LOGGER.info('TensorBoard 0.1.7 at http://localhost:{}'.format(port))
while not self.estimator.checkpoint_path:
self.event.wait(1)
while not self.event.is_set():
args = ['aws', 's3', 'sync', self.estimator.checkpoint_path, self.logdir]
subprocess.call(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
self.event.wait(10)
with self._temporary_directory() as aws_sync_dir:
while not self.event.is_set():
args = ['aws', 's3', 'sync', self.estimator.checkpoint_path, aws_sync_dir]
subprocess.call(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
self._sync_directories(aws_sync_dir, self.logdir)
self.event.wait(10)
tensorboard_process.terminate()


Expand Down
92 changes: 92 additions & 0 deletions tests/unit/test_sync_directories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
import filecmp
import os
import random
import shutil

from sagemaker.tensorflow.estimator import Tensorboard


def create_test_directory(directory, variable_content="hello world"):
"""Create dummy data for testing Tensorboard._sync_directories with the
following structure:

<directory>
|_ child_directory
|_ hello.txt
|_ foo1.txt
|_ foo2.txt

Args:
directory (str): The path to a directory to create with fake files
variable_content (str): Content to put in one of the files
"""
child_dir = os.path.join(directory, 'child_directory')
os.mkdir(child_dir)
with open(os.path.join(directory, 'foo1.txt'), 'w') as f:
f.write('bar1')
with open(os.path.join(directory, 'foo2.txt'), 'w') as f:
f.write('bar2')
with open(os.path.join(child_dir, 'hello.txt'), 'w') as f:
f.write(variable_content)


def same_dirs(a, b):
"""Check that structure and files are the same for directories a and b

Args:
a (str): The path to the first directory
b (str): The path to the second directory
"""
comp = filecmp.dircmp(a, b)
common = sorted(comp.common)
left = sorted(comp.left_list)
right = sorted(comp.right_list)
if left != common or right != common:
return False
if len(comp.diff_files):
return False
for subdir in comp.common_dirs:
left_subdir = os.path.join(a, subdir)
right_subdir = os.path.join(b, subdir)
return same_dirs(left_subdir, right_subdir)
return True


def test_to_directory_doesnt_exist():
with Tensorboard._temporary_directory() as from_dir:
create_test_directory(from_dir)
to_dir = './not_a_real_place_{}'.format(random.getrandbits(64))
Tensorboard._sync_directories(from_dir, to_dir)
assert same_dirs(from_dir, to_dir)
shutil.rmtree(to_dir)


def test_only_root_of_to_directory_exists():
with Tensorboard._temporary_directory() as from_dir:
with Tensorboard._temporary_directory() as to_dir:
create_test_directory(from_dir)
assert not same_dirs(from_dir, to_dir)
Tensorboard._sync_directories(from_dir, to_dir)
assert same_dirs(from_dir, to_dir)


def test_files_are_overwritten_when_they_already_exist():
with Tensorboard._temporary_directory() as from_dir:
with Tensorboard._temporary_directory() as to_dir:
create_test_directory(from_dir)
create_test_directory(to_dir, "foo bar")
assert not same_dirs(from_dir, to_dir)
Tensorboard._sync_directories(from_dir, to_dir)
assert same_dirs(from_dir, to_dir)