Skip to content

Commit 108300f

Browse files
qidewenwhenroot
authored andcommitted
fix: Add write permission to job output dirs for remote and step decorator running on non-root job user (aws#4325)
1 parent 250767c commit 108300f

File tree

7 files changed

+298
-7
lines changed

7 files changed

+298
-7
lines changed

src/sagemaker/remote_function/runtime_environment/bootstrap_runtime_environment.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from __future__ import absolute_import
1515

1616
import argparse
17+
import getpass
1718
import sys
1819
import os
1920
import shutil
@@ -38,6 +39,7 @@
3839
REMOTE_FUNCTION_WORKSPACE = "sm_rf_user_ws"
3940
BASE_CHANNEL_PATH = "/opt/ml/input/data"
4041
FAILURE_REASON_PATH = "/opt/ml/output/failure"
42+
JOB_OUTPUT_DIRS = ["/opt/ml/output", "/opt/ml/model", "/tmp"]
4143
PRE_EXECUTION_SCRIPT_NAME = "pre_exec.sh"
4244
JOB_REMOTE_FUNCTION_WORKSPACE = "sagemaker_remote_function_workspace"
4345
SCRIPT_AND_DEPENDENCIES_CHANNEL_NAME = "pre_exec_script_and_dependencies"
@@ -63,6 +65,17 @@ def main(sys_args=None):
6365

6466
RuntimeEnvironmentManager()._validate_python_version(client_python_version, conda_env)
6567

68+
user = getpass.getuser()
69+
if user != "root":
70+
log_message = (
71+
"The job is running on non-root user: %s. Adding write permissions to the "
72+
"following job output directories: %s."
73+
)
74+
logger.info(log_message, user, JOB_OUTPUT_DIRS)
75+
RuntimeEnvironmentManager().change_dir_permission(
76+
dirs=JOB_OUTPUT_DIRS, new_permission="777"
77+
)
78+
6679
if pipeline_execution_id:
6780
_bootstrap_runtime_env_for_pipeline_step(
6881
client_python_version, func_step_workspace, conda_env, dependency_settings

src/sagemaker/remote_function/runtime_environment/runtime_environment_manager.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,32 @@ def run_pre_exec_script(self, pre_exec_script_path: str):
216216
pre_exec_script_path,
217217
)
218218

219+
def change_dir_permission(self, dirs: list, new_permission: str):
220+
"""Change the permission of given directories
221+
222+
Args:
223+
dirs (list[str]): A list of directories for permission update.
224+
new_permission (str): The new permission for the given directories.
225+
"""
226+
227+
_ERROR_MSG_PREFIX = "Failed to change directory permissions due to: "
228+
command = ["sudo", "chmod", "-R", new_permission] + dirs
229+
logger.info("Executing '%s'.", " ".join(command))
230+
231+
try:
232+
subprocess.run(command, check=True, stderr=subprocess.PIPE)
233+
except subprocess.CalledProcessError as called_process_err:
234+
err_msg = called_process_err.stderr.decode("utf-8")
235+
raise RuntimeEnvironmentError(f"{_ERROR_MSG_PREFIX} {err_msg}")
236+
except FileNotFoundError as file_not_found_err:
237+
if "[Errno 2] No such file or directory: 'sudo'" in str(file_not_found_err):
238+
raise RuntimeEnvironmentError(
239+
f"{_ERROR_MSG_PREFIX} {file_not_found_err}. "
240+
"Please contact the image owner to install 'sudo' in the job container "
241+
"and provide sudo privilege to the container user."
242+
)
243+
raise RuntimeEnvironmentError(file_not_found_err)
244+
219245
def _is_file_exists(self, dependencies):
220246
"""Check whether the dependencies file exists at the given location.
221247

tests/integ/sagemaker/conftest.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,12 @@
6868
"RUN curl 'https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip' -o 'awscliv2.zip' \
6969
&& unzip awscliv2.zip \
7070
&& ./aws/install\n\n"
71+
"RUN apt install sudo\n"
7172
"RUN useradd -ms /bin/bash integ-test-user\n"
73+
# Add the user to sudo group
74+
"RUN usermod -aG sudo integ-test-user\n"
75+
# Ensure passwords are not required for sudo group users
76+
"RUN echo '%sudo ALL= (ALL) NOPASSWD:ALL' >> /etc/sudoers\n"
7277
"USER integ-test-user\n"
7378
"WORKDIR /home/integ-test-user\n"
7479
"COPY {source_archive} ./\n"

tests/integ/sagemaker/remote_function/test_decorator.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -747,6 +747,25 @@ def cuberoot(x):
747747
assert cuberoot(27) == 3
748748

749749

750+
def test_with_user_and_workdir_set_in_the_image_client_error_case(
751+
sagemaker_session, dummy_container_with_user_and_workdir, cpu_instance_type
752+
):
753+
client_error_message = "Testing client error in job."
754+
755+
@remote(
756+
role=ROLE,
757+
image_uri=dummy_container_with_user_and_workdir,
758+
instance_type=cpu_instance_type,
759+
sagemaker_session=sagemaker_session,
760+
)
761+
def my_func():
762+
raise RuntimeError(client_error_message)
763+
764+
with pytest.raises(RuntimeError) as error:
765+
my_func()
766+
assert client_error_message in str(error)
767+
768+
750769
@pytest.mark.skip
751770
def test_decorator_with_spark_job(sagemaker_session, cpu_instance_type):
752771
@remote(

tests/integ/sagemaker/workflow/test_step_decorator.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -860,6 +860,52 @@ def cuberoot(x):
860860
pass
861861

862862

863+
def test_with_user_and_workdir_set_in_the_image_client_error_case(
864+
sagemaker_session, role, pipeline_name, region_name, dummy_container_with_user_and_workdir
865+
):
866+
# This test aims to ensure client error in step decorated function
867+
# can be successfully surfaced and the job can be failed.
868+
os.environ["AWS_DEFAULT_REGION"] = region_name
869+
client_error_message = "Testing client error in job."
870+
871+
@step(
872+
role=role,
873+
image_uri=dummy_container_with_user_and_workdir,
874+
instance_type=INSTANCE_TYPE,
875+
)
876+
def my_func():
877+
raise RuntimeError(client_error_message)
878+
879+
step_a = my_func()
880+
881+
pipeline = Pipeline(
882+
name=pipeline_name,
883+
steps=[step_a],
884+
sagemaker_session=sagemaker_session,
885+
)
886+
887+
try:
888+
_, execution_steps = create_and_execute_pipeline(
889+
pipeline=pipeline,
890+
pipeline_name=pipeline_name,
891+
region_name=region_name,
892+
role=role,
893+
no_of_steps=1,
894+
last_step_name=get_step(step_a).name,
895+
execution_parameters=dict(),
896+
step_status="Failed",
897+
)
898+
assert (
899+
f"ClientError: AlgorithmError: RuntimeError('{client_error_message}')"
900+
in execution_steps[0]["FailureReason"]
901+
)
902+
finally:
903+
try:
904+
pipeline.delete()
905+
except Exception:
906+
pass
907+
908+
863909
def test_step_level_serialization(
864910
sagemaker_session, role, pipeline_name, region_name, dummy_container_without_error
865911
):

tests/unit/sagemaker/remote_function/runtime_environment/test_bootstrap_runtime_environment.py

Lines changed: 138 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@
1212
# language governing permissions and limitations under the License.
1313
from __future__ import absolute_import
1414

15-
1615
from mock import patch
16+
from mock.mock import MagicMock
17+
1718
from sagemaker.remote_function.runtime_environment.runtime_environment_manager import (
1819
RuntimeEnvironmentError,
1920
_DependencySettings,
@@ -78,14 +79,22 @@ def args_for_step():
7879
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
7980
"_bootstrap_runtime_env_for_remote_function"
8081
)
81-
def test_main_success_remote_job(
82+
@patch("getpass.getuser", MagicMock(return_value="root"))
83+
@patch(
84+
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
85+
"RuntimeEnvironmentManager.change_dir_permission"
86+
)
87+
def test_main_success_remote_job_with_root_user(
88+
change_dir_permission,
8289
bootstrap_remote,
8390
run_pre_exec_script,
8491
bootstrap_runtime,
8592
validate_python,
8693
_exit_process,
8794
):
8895
bootstrap.main(args_for_remote())
96+
97+
change_dir_permission.assert_not_called()
8998
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
9099
bootstrap_remote.assert_called_once_with(
91100
TEST_PYTHON_VERSION,
@@ -114,14 +123,21 @@ def test_main_success_remote_job(
114123
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
115124
"_bootstrap_runtime_env_for_pipeline_step"
116125
)
117-
def test_main_success_pipeline_step(
126+
@patch("getpass.getuser", MagicMock(return_value="root"))
127+
@patch(
128+
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
129+
"RuntimeEnvironmentManager.change_dir_permission"
130+
)
131+
def test_main_success_pipeline_step_with_root_user(
132+
change_dir_permission,
118133
bootstrap_step,
119134
run_pre_exec_script,
120135
bootstrap_runtime,
121136
validate_python,
122137
_exit_process,
123138
):
124139
bootstrap.main(args_for_step())
140+
change_dir_permission.assert_not_called()
125141
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
126142
bootstrap_step.assert_called_once_with(
127143
TEST_PYTHON_VERSION,
@@ -150,14 +166,25 @@ def test_main_success_pipeline_step(
150166
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
151167
"_bootstrap_runtime_env_for_remote_function"
152168
)
153-
def test_main_failure_remote_job(
154-
bootstrap_runtime, run_pre_exec_script, write_failure, _exit_process, validate_python
169+
@patch("getpass.getuser", MagicMock(return_value="root"))
170+
@patch(
171+
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
172+
"RuntimeEnvironmentManager.change_dir_permission"
173+
)
174+
def test_main_failure_remote_job_with_root_user(
175+
change_dir_permission,
176+
bootstrap_runtime,
177+
run_pre_exec_script,
178+
write_failure,
179+
_exit_process,
180+
validate_python,
155181
):
156182
runtime_err = RuntimeEnvironmentError("some failure reason")
157183
bootstrap_runtime.side_effect = runtime_err
158184

159185
bootstrap.main(args_for_remote())
160186

187+
change_dir_permission.assert_not_called()
161188
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
162189
run_pre_exec_script.assert_not_called()
163190
bootstrap_runtime.assert_called()
@@ -181,21 +208,125 @@ def test_main_failure_remote_job(
181208
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
182209
"_bootstrap_runtime_env_for_pipeline_step"
183210
)
184-
def test_main_failure_pipeline_step(
185-
bootstrap_runtime, run_pre_exec_script, write_failure, _exit_process, validate_python
211+
@patch("getpass.getuser", MagicMock(return_value="root"))
212+
@patch(
213+
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
214+
"RuntimeEnvironmentManager.change_dir_permission"
215+
)
216+
def test_main_failure_pipeline_step_with_root_user(
217+
change_dir_permission,
218+
bootstrap_runtime,
219+
run_pre_exec_script,
220+
write_failure,
221+
_exit_process,
222+
validate_python,
186223
):
187224
runtime_err = RuntimeEnvironmentError("some failure reason")
188225
bootstrap_runtime.side_effect = runtime_err
189226

190227
bootstrap.main(args_for_step())
191228

229+
change_dir_permission.assert_not_called()
192230
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
193231
run_pre_exec_script.assert_not_called()
194232
bootstrap_runtime.assert_called()
195233
write_failure.assert_called_with(str(runtime_err))
196234
_exit_process.assert_called_with(1)
197235

198236

237+
@patch("sys.exit")
238+
@patch(
239+
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
240+
"RuntimeEnvironmentManager._validate_python_version"
241+
)
242+
@patch(
243+
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
244+
"RuntimeEnvironmentManager.bootstrap"
245+
)
246+
@patch(
247+
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
248+
"RuntimeEnvironmentManager.run_pre_exec_script"
249+
)
250+
@patch(
251+
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
252+
"_bootstrap_runtime_env_for_remote_function"
253+
)
254+
@patch("getpass.getuser", MagicMock(return_value="non_root"))
255+
@patch(
256+
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
257+
"RuntimeEnvironmentManager.change_dir_permission"
258+
)
259+
def test_main_remote_job_with_non_root_user(
260+
change_dir_permission,
261+
bootstrap_remote,
262+
run_pre_exec_script,
263+
bootstrap_runtime,
264+
validate_python,
265+
_exit_process,
266+
):
267+
bootstrap.main(args_for_remote())
268+
269+
change_dir_permission.assert_called_once_with(
270+
dirs=bootstrap.JOB_OUTPUT_DIRS, new_permission="777"
271+
)
272+
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
273+
bootstrap_remote.assert_called_once_with(
274+
TEST_PYTHON_VERSION,
275+
TEST_JOB_CONDA_ENV,
276+
_DependencySettings(TEST_DEPENDENCY_FILE_NAME),
277+
)
278+
run_pre_exec_script.assert_not_called()
279+
bootstrap_runtime.assert_not_called()
280+
_exit_process.assert_called_with(0)
281+
282+
283+
@patch("sys.exit")
284+
@patch(
285+
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
286+
"RuntimeEnvironmentManager._validate_python_version"
287+
)
288+
@patch(
289+
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
290+
"RuntimeEnvironmentManager.bootstrap"
291+
)
292+
@patch(
293+
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
294+
"RuntimeEnvironmentManager.run_pre_exec_script"
295+
)
296+
@patch(
297+
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
298+
"_bootstrap_runtime_env_for_pipeline_step"
299+
)
300+
@patch("getpass.getuser", MagicMock(return_value="non_root"))
301+
@patch(
302+
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
303+
"RuntimeEnvironmentManager.change_dir_permission"
304+
)
305+
def test_main_pipeline_step_with_non_root_user(
306+
change_dir_permission,
307+
bootstrap_step,
308+
run_pre_exec_script,
309+
bootstrap_runtime,
310+
validate_python,
311+
_exit_process,
312+
):
313+
bootstrap.main(args_for_step())
314+
315+
change_dir_permission.assert_called_once_with(
316+
dirs=bootstrap.JOB_OUTPUT_DIRS, new_permission="777"
317+
)
318+
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
319+
bootstrap_step.assert_called_once_with(
320+
TEST_PYTHON_VERSION,
321+
FUNC_STEP_WORKSPACE,
322+
TEST_JOB_CONDA_ENV,
323+
None,
324+
)
325+
run_pre_exec_script.assert_not_called()
326+
bootstrap_runtime.assert_not_called()
327+
_exit_process.assert_called_with(0)
328+
329+
199330
@patch("shutil.unpack_archive")
200331
@patch("os.getcwd", return_value=CURR_WORKING_DIR)
201332
@patch("os.path.exists", return_value=True)

0 commit comments

Comments
 (0)