@@ -117,12 +117,16 @@ def train(self, input_data_config, hyperparameters):
117
117
# free up the training data directory as it may contain
118
118
# lots of data downloaded from S3. This doesn't delete any local
119
119
# data that was just mounted to the container.
120
- shutil . rmtree (data_dir )
120
+ _delete_tree (data_dir )
121
121
# Also free the container config files.
122
122
for host in self .hosts :
123
- shutil .rmtree (os .path .join (self .container_root , host ))
123
+ container_config_path = os .path .join (self .container_root , host )
124
+ _delete_tree (container_config_path )
124
125
125
126
self ._cleanup ()
127
+ # Print our Job Complete line to have a similar experience to training on SageMaker where you
128
+ # see this line at the end.
129
+ print ('===== Job Complete =====' )
126
130
return s3_model_artifacts
127
131
128
132
def serve (self , primary_container ):
@@ -162,7 +166,7 @@ def stop_serving(self):
162
166
self .container .down ()
163
167
self ._cleanup ()
164
168
# for serving we can delete everything in the container root.
165
- shutil . rmtree (self .container_root )
169
+ _delete_tree (self .container_root )
166
170
167
171
def retrieve_model_artifacts (self , compose_data ):
168
172
"""Get the model artifacts from all the container nodes.
@@ -185,9 +189,9 @@ def retrieve_model_artifacts(self, compose_data):
185
189
volumes = compose_data ['services' ][str (host )]['volumes' ]
186
190
187
191
for volume in volumes :
188
- container_dir , host_dir = volume .split (':' )
189
- if host_dir == '/opt/ml/model' :
190
- self ._recursive_copy (container_dir , s3_model_artifacts )
192
+ host_dir , container_dir = volume .split (':' )
193
+ if container_dir == '/opt/ml/model' :
194
+ self ._recursive_copy (host_dir , s3_model_artifacts )
191
195
192
196
return s3_model_artifacts
193
197
@@ -304,7 +308,7 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
304
308
return content
305
309
306
310
def _compose (self , detached = False ):
307
- compose_cmd = 'nvidia-docker-compose' if self . instance_type == "local_gpu" else ' docker-compose'
311
+ compose_cmd = 'docker-compose'
308
312
309
313
command = [
310
314
compose_cmd ,
@@ -480,6 +484,21 @@ def _create_config_file_directories(root, host):
480
484
os .makedirs (os .path .join (root , host , d ))
481
485
482
486
487
+ def _delete_tree (path ):
488
+ try :
489
+ shutil .rmtree (path )
490
+ except OSError as exc :
491
+ # on Linux, when docker writes to any mounted volume, it uses the container's user. In most cases
492
+ # this is root. When the container exits and we try to delete them we can't because root owns those
493
+ # files. We expect this to happen, so we handle EACCES. Any other error we will raise the
494
+ # exception up.
495
+ if exc .errno == errno .EACCES :
496
+ logger .warning ("Failed to delete: %s Please remove it manually." % path )
497
+ else :
498
+ logger .error ("Failed to delete: %s" % path )
499
+ raise
500
+
501
+
483
502
def _aws_credentials (session ):
484
503
try :
485
504
creds = session .get_credentials ()
0 commit comments