Skip to content

fix: Add write permission to job output dirs for remote and step decorator running on non-root job user #4325

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Dec 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from __future__ import absolute_import

import argparse
import getpass
import sys
import os
import shutil
Expand All @@ -38,6 +39,7 @@
REMOTE_FUNCTION_WORKSPACE = "sm_rf_user_ws"
BASE_CHANNEL_PATH = "/opt/ml/input/data"
FAILURE_REASON_PATH = "/opt/ml/output/failure"
JOB_OUTPUT_DIRS = ["/opt/ml/output", "/opt/ml/model", "/tmp"]
PRE_EXECUTION_SCRIPT_NAME = "pre_exec.sh"
JOB_REMOTE_FUNCTION_WORKSPACE = "sagemaker_remote_function_workspace"
SCRIPT_AND_DEPENDENCIES_CHANNEL_NAME = "pre_exec_script_and_dependencies"
Expand All @@ -63,6 +65,17 @@ def main(sys_args=None):

RuntimeEnvironmentManager()._validate_python_version(client_python_version, conda_env)

user = getpass.getuser()
if user != "root":
log_message = (
"The job is running on non-root user: %s. Adding write permissions to the "
"following job output directories: %s."
)
logger.info(log_message, user, JOB_OUTPUT_DIRS)
RuntimeEnvironmentManager().change_dir_permission(
dirs=JOB_OUTPUT_DIRS, new_permission="777"
)

if pipeline_execution_id:
_bootstrap_runtime_env_for_pipeline_step(
client_python_version, func_step_workspace, conda_env, dependency_settings
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,32 @@ def run_pre_exec_script(self, pre_exec_script_path: str):
pre_exec_script_path,
)

def change_dir_permission(self, dirs: list, new_permission: str):
"""Change the permission of given directories

Args:
dirs (list[str]): A list of directories for permission update.
new_permission (str): The new permission for the given directories.
"""

_ERROR_MSG_PREFIX = "Failed to change directory permissions due to: "
command = ["sudo", "chmod", "-R", new_permission] + dirs
logger.info("Executing '%s'.", " ".join(command))

try:
subprocess.run(command, check=True, stderr=subprocess.PIPE)
except subprocess.CalledProcessError as called_process_err:
err_msg = called_process_err.stderr.decode("utf-8")
raise RuntimeEnvironmentError(f"{_ERROR_MSG_PREFIX} {err_msg}")
except FileNotFoundError as file_not_found_err:
if "[Errno 2] No such file or directory: 'sudo'" in str(file_not_found_err):
raise RuntimeEnvironmentError(
f"{_ERROR_MSG_PREFIX} {file_not_found_err}. "
"Please contact the image owner to install 'sudo' in the job container "
"and provide sudo privilege to the container user."
)
raise RuntimeEnvironmentError(file_not_found_err)

def _is_file_exists(self, dependencies):
"""Check whether the dependencies file exists at the given location.

Expand Down
5 changes: 5 additions & 0 deletions tests/integ/sagemaker/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,12 @@
"RUN curl 'https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip' -o 'awscliv2.zip' \
&& unzip awscliv2.zip \
&& ./aws/install\n\n"
"RUN apt install sudo\n"
"RUN useradd -ms /bin/bash integ-test-user\n"
# Add the user to sudo group
"RUN usermod -aG sudo integ-test-user\n"
# Ensure passwords are not required for sudo group users
"RUN echo '%sudo ALL= (ALL) NOPASSWD:ALL' >> /etc/sudoers\n"
"USER integ-test-user\n"
"WORKDIR /home/integ-test-user\n"
"COPY {source_archive} ./\n"
Expand Down
19 changes: 19 additions & 0 deletions tests/integ/sagemaker/remote_function/test_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -747,6 +747,25 @@ def cuberoot(x):
assert cuberoot(27) == 3


def test_with_user_and_workdir_set_in_the_image_client_error_case(
sagemaker_session, dummy_container_with_user_and_workdir, cpu_instance_type
):
client_error_message = "Testing client error in job."

@remote(
role=ROLE,
image_uri=dummy_container_with_user_and_workdir,
instance_type=cpu_instance_type,
sagemaker_session=sagemaker_session,
)
def my_func():
raise RuntimeError(client_error_message)

with pytest.raises(RuntimeError) as error:
my_func()
assert client_error_message in str(error)


@pytest.mark.skip
def test_decorator_with_spark_job(sagemaker_session, cpu_instance_type):
@remote(
Expand Down
46 changes: 46 additions & 0 deletions tests/integ/sagemaker/workflow/test_step_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -860,6 +860,52 @@ def cuberoot(x):
pass


def test_with_user_and_workdir_set_in_the_image_client_error_case(
sagemaker_session, role, pipeline_name, region_name, dummy_container_with_user_and_workdir
):
# This test aims to ensure client error in step decorated function
# can be successfully surfaced and the job can be failed.
os.environ["AWS_DEFAULT_REGION"] = region_name
client_error_message = "Testing client error in job."

@step(
role=role,
image_uri=dummy_container_with_user_and_workdir,
instance_type=INSTANCE_TYPE,
)
def my_func():
raise RuntimeError(client_error_message)

step_a = my_func()

pipeline = Pipeline(
name=pipeline_name,
steps=[step_a],
sagemaker_session=sagemaker_session,
)

try:
_, execution_steps = create_and_execute_pipeline(
pipeline=pipeline,
pipeline_name=pipeline_name,
region_name=region_name,
role=role,
no_of_steps=1,
last_step_name=get_step(step_a).name,
execution_parameters=dict(),
step_status="Failed",
)
assert (
f"ClientError: AlgorithmError: RuntimeError('{client_error_message}')"
in execution_steps[0]["FailureReason"]
)
finally:
try:
pipeline.delete()
except Exception:
pass


def test_step_level_serialization(
sagemaker_session, role, pipeline_name, region_name, dummy_container_without_error
):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@
# language governing permissions and limitations under the License.
from __future__ import absolute_import


from mock import patch
from mock.mock import MagicMock

from sagemaker.remote_function.runtime_environment.runtime_environment_manager import (
RuntimeEnvironmentError,
_DependencySettings,
Expand Down Expand Up @@ -78,14 +79,22 @@ def args_for_step():
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
"_bootstrap_runtime_env_for_remote_function"
)
def test_main_success_remote_job(
@patch("getpass.getuser", MagicMock(return_value="root"))
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.change_dir_permission"
)
def test_main_success_remote_job_with_root_user(
change_dir_permission,
bootstrap_remote,
run_pre_exec_script,
bootstrap_runtime,
validate_python,
_exit_process,
):
bootstrap.main(args_for_remote())

change_dir_permission.assert_not_called()
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
bootstrap_remote.assert_called_once_with(
TEST_PYTHON_VERSION,
Expand Down Expand Up @@ -114,14 +123,21 @@ def test_main_success_remote_job(
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
"_bootstrap_runtime_env_for_pipeline_step"
)
def test_main_success_pipeline_step(
@patch("getpass.getuser", MagicMock(return_value="root"))
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.change_dir_permission"
)
def test_main_success_pipeline_step_with_root_user(
change_dir_permission,
bootstrap_step,
run_pre_exec_script,
bootstrap_runtime,
validate_python,
_exit_process,
):
bootstrap.main(args_for_step())
change_dir_permission.assert_not_called()
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
bootstrap_step.assert_called_once_with(
TEST_PYTHON_VERSION,
Expand Down Expand Up @@ -150,14 +166,25 @@ def test_main_success_pipeline_step(
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
"_bootstrap_runtime_env_for_remote_function"
)
def test_main_failure_remote_job(
bootstrap_runtime, run_pre_exec_script, write_failure, _exit_process, validate_python
@patch("getpass.getuser", MagicMock(return_value="root"))
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.change_dir_permission"
)
def test_main_failure_remote_job_with_root_user(
change_dir_permission,
bootstrap_runtime,
run_pre_exec_script,
write_failure,
_exit_process,
validate_python,
):
runtime_err = RuntimeEnvironmentError("some failure reason")
bootstrap_runtime.side_effect = runtime_err

bootstrap.main(args_for_remote())

change_dir_permission.assert_not_called()
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
run_pre_exec_script.assert_not_called()
bootstrap_runtime.assert_called()
Expand All @@ -181,21 +208,125 @@ def test_main_failure_remote_job(
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
"_bootstrap_runtime_env_for_pipeline_step"
)
def test_main_failure_pipeline_step(
bootstrap_runtime, run_pre_exec_script, write_failure, _exit_process, validate_python
@patch("getpass.getuser", MagicMock(return_value="root"))
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.change_dir_permission"
)
def test_main_failure_pipeline_step_with_root_user(
change_dir_permission,
bootstrap_runtime,
run_pre_exec_script,
write_failure,
_exit_process,
validate_python,
):
runtime_err = RuntimeEnvironmentError("some failure reason")
bootstrap_runtime.side_effect = runtime_err

bootstrap.main(args_for_step())

change_dir_permission.assert_not_called()
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
run_pre_exec_script.assert_not_called()
bootstrap_runtime.assert_called()
write_failure.assert_called_with(str(runtime_err))
_exit_process.assert_called_with(1)


@patch("sys.exit")
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager._validate_python_version"
)
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.bootstrap"
)
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.run_pre_exec_script"
)
@patch(
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
"_bootstrap_runtime_env_for_remote_function"
)
@patch("getpass.getuser", MagicMock(return_value="non_root"))
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.change_dir_permission"
)
def test_main_remote_job_with_non_root_user(
change_dir_permission,
bootstrap_remote,
run_pre_exec_script,
bootstrap_runtime,
validate_python,
_exit_process,
):
bootstrap.main(args_for_remote())

change_dir_permission.assert_called_once_with(
dirs=bootstrap.JOB_OUTPUT_DIRS, new_permission="777"
)
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
bootstrap_remote.assert_called_once_with(
TEST_PYTHON_VERSION,
TEST_JOB_CONDA_ENV,
_DependencySettings(TEST_DEPENDENCY_FILE_NAME),
)
run_pre_exec_script.assert_not_called()
bootstrap_runtime.assert_not_called()
_exit_process.assert_called_with(0)


@patch("sys.exit")
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager._validate_python_version"
)
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.bootstrap"
)
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.run_pre_exec_script"
)
@patch(
"sagemaker.remote_function.runtime_environment.bootstrap_runtime_environment."
"_bootstrap_runtime_env_for_pipeline_step"
)
@patch("getpass.getuser", MagicMock(return_value="non_root"))
@patch(
"sagemaker.remote_function.runtime_environment.runtime_environment_manager."
"RuntimeEnvironmentManager.change_dir_permission"
)
def test_main_pipeline_step_with_non_root_user(
change_dir_permission,
bootstrap_step,
run_pre_exec_script,
bootstrap_runtime,
validate_python,
_exit_process,
):
bootstrap.main(args_for_step())

change_dir_permission.assert_called_once_with(
dirs=bootstrap.JOB_OUTPUT_DIRS, new_permission="777"
)
validate_python.assert_called_once_with(TEST_PYTHON_VERSION, TEST_JOB_CONDA_ENV)
bootstrap_step.assert_called_once_with(
TEST_PYTHON_VERSION,
FUNC_STEP_WORKSPACE,
TEST_JOB_CONDA_ENV,
None,
)
run_pre_exec_script.assert_not_called()
bootstrap_runtime.assert_not_called()
_exit_process.assert_called_with(0)


@patch("shutil.unpack_archive")
@patch("os.getcwd", return_value=CURR_WORKING_DIR)
@patch("os.path.exists", return_value=True)
Expand Down
Loading