Skip to content

Commit da14a6b

Browse files
Ben Cookwinstonaws
authored andcommitted
Fix TensorFlow log syncing bug (#105)
* Support multi-part uploads Ignore Emacs backup files * copy tf files to clean directory for tensorboard * cleanup temp directory * explain approach in docstring * unit tests for _sync_directories * clean up * make context manager private method * simplify directory comparison
1 parent 2af9d45 commit da14a6b

File tree

2 files changed

+141
-4
lines changed

2 files changed

+141
-4
lines changed

src/sagemaker/tensorflow/estimator.py

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,10 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13+
import contextlib
1314
import logging
1415
import os
16+
import shutil
1517
import subprocess
1618
import tempfile
1719
import threading
@@ -46,6 +48,47 @@ def _cmd_exists(cmd):
4648
for path in os.environ["PATH"].split(os.pathsep)
4749
)
4850

51+
@staticmethod
52+
def _sync_directories(from_directory, to_directory):
53+
"""Sync to_directory with from_directory by copying each file in
54+
to_directory with new contents. Files in to_directory will be
55+
overwritten by files of the same name in from_directory. We need to
56+
keep two copies of the log directory because otherwise TensorBoard
57+
picks up temp files from `aws s3 sync` and then stops reading the
58+
correct tfevent files. We walk the directory and copy each file
59+
individually because the directory that TensorBoard watches needs to
60+
always exist.
61+
62+
Args:
63+
from_directory (str): The directory with updated files.
64+
to_directory (str): The directory to be synced.
65+
"""
66+
if not os.path.exists(to_directory):
67+
os.mkdir(to_directory)
68+
for root, dirs, files in os.walk(from_directory):
69+
to_root = root.replace(from_directory, to_directory)
70+
for directory in dirs:
71+
to_child_dir = os.path.join(to_root, directory)
72+
if not os.path.exists(to_child_dir):
73+
os.mkdir(to_child_dir)
74+
for fname in files:
75+
from_file = os.path.join(root, fname)
76+
to_file = os.path.join(to_root, fname)
77+
with open(from_file, 'rb') as a, open(to_file, 'wb') as b:
78+
b.write(a.read())
79+
80+
@staticmethod
81+
@contextlib.contextmanager
82+
def _temporary_directory():
83+
"""Context manager for a temporary directory. This is similar to
84+
tempfile.TemporaryDirectory in python>=3.2.
85+
"""
86+
name = tempfile.mkdtemp()
87+
try:
88+
yield name
89+
finally:
90+
shutil.rmtree(name)
91+
4992
def validate_requirements(self):
5093
"""Ensure that TensorBoard and the AWS CLI are installed.
5194
@@ -96,10 +139,12 @@ def run(self):
96139
LOGGER.info('TensorBoard 0.1.7 at http://localhost:{}'.format(port))
97140
while not self.estimator.checkpoint_path:
98141
self.event.wait(1)
99-
while not self.event.is_set():
100-
args = ['aws', 's3', 'sync', self.estimator.checkpoint_path, self.logdir]
101-
subprocess.call(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
102-
self.event.wait(10)
142+
with self._temporary_directory() as aws_sync_dir:
143+
while not self.event.is_set():
144+
args = ['aws', 's3', 'sync', self.estimator.checkpoint_path, aws_sync_dir]
145+
subprocess.call(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
146+
self._sync_directories(aws_sync_dir, self.logdir)
147+
self.event.wait(10)
103148
tensorboard_process.terminate()
104149

105150

tests/unit/test_sync_directories.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
import filecmp
14+
import os
15+
import random
16+
import shutil
17+
18+
from sagemaker.tensorflow.estimator import Tensorboard
19+
20+
21+
def create_test_directory(directory, variable_content="hello world"):
22+
"""Create dummy data for testing Tensorboard._sync_directories with the
23+
following structure:
24+
25+
<directory>
26+
|_ child_directory
27+
|_ hello.txt
28+
|_ foo1.txt
29+
|_ foo2.txt
30+
31+
Args:
32+
directory (str): The path to a directory to create with fake files
33+
variable_content (str): Content to put in one of the files
34+
"""
35+
child_dir = os.path.join(directory, 'child_directory')
36+
os.mkdir(child_dir)
37+
with open(os.path.join(directory, 'foo1.txt'), 'w') as f:
38+
f.write('bar1')
39+
with open(os.path.join(directory, 'foo2.txt'), 'w') as f:
40+
f.write('bar2')
41+
with open(os.path.join(child_dir, 'hello.txt'), 'w') as f:
42+
f.write(variable_content)
43+
44+
45+
def same_dirs(a, b):
46+
"""Check that structure and files are the same for directories a and b
47+
48+
Args:
49+
a (str): The path to the first directory
50+
b (str): The path to the second directory
51+
"""
52+
comp = filecmp.dircmp(a, b)
53+
common = sorted(comp.common)
54+
left = sorted(comp.left_list)
55+
right = sorted(comp.right_list)
56+
if left != common or right != common:
57+
return False
58+
if len(comp.diff_files):
59+
return False
60+
for subdir in comp.common_dirs:
61+
left_subdir = os.path.join(a, subdir)
62+
right_subdir = os.path.join(b, subdir)
63+
return same_dirs(left_subdir, right_subdir)
64+
return True
65+
66+
67+
def test_to_directory_doesnt_exist():
68+
with Tensorboard._temporary_directory() as from_dir:
69+
create_test_directory(from_dir)
70+
to_dir = './not_a_real_place_{}'.format(random.getrandbits(64))
71+
Tensorboard._sync_directories(from_dir, to_dir)
72+
assert same_dirs(from_dir, to_dir)
73+
shutil.rmtree(to_dir)
74+
75+
76+
def test_only_root_of_to_directory_exists():
77+
with Tensorboard._temporary_directory() as from_dir:
78+
with Tensorboard._temporary_directory() as to_dir:
79+
create_test_directory(from_dir)
80+
assert not same_dirs(from_dir, to_dir)
81+
Tensorboard._sync_directories(from_dir, to_dir)
82+
assert same_dirs(from_dir, to_dir)
83+
84+
85+
def test_files_are_overwritten_when_they_already_exist():
86+
with Tensorboard._temporary_directory() as from_dir:
87+
with Tensorboard._temporary_directory() as to_dir:
88+
create_test_directory(from_dir)
89+
create_test_directory(to_dir, "foo bar")
90+
assert not same_dirs(from_dir, to_dir)
91+
Tensorboard._sync_directories(from_dir, to_dir)
92+
assert same_dirs(from_dir, to_dir)

0 commit comments

Comments
 (0)