Skip to content

fix: Fix bug forcing uploaded tar to be named sourcedir #3412

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Dec 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 11 additions & 8 deletions src/sagemaker/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -1587,13 +1587,13 @@ def run( # type: ignore[override]
framework script to run. Path (absolute or relative) to the local
Python source file which should be executed as the entry point
to training. When `code` is an S3 URI, ignore `source_dir`,
`dependencies, and `git_config`. If ``source_dir`` is specified,
`dependencies`, and `git_config`. If ``source_dir`` is specified,
then ``code`` must point to a file located at the root of ``source_dir``.
source_dir (str): Path (absolute, relative or an S3 URI) to a directory
with any other processing source code dependencies aside from the entry
point file (default: None). If ``source_dir`` is an S3 URI, it must
point to a tar.gz file. Structure within this directory are preserved
when processing on Amazon SageMaker (default: None).
point to a file named `sourcedir.tar.gz`. Structure within this directory
is preserved when processing on Amazon SageMaker (default: None).
dependencies (list[str]): A list of paths to directories (absolute
or relative) with any additional libraries that will be exported
to the container (default: []). The library folders will be
Expand Down Expand Up @@ -1730,12 +1730,15 @@ def _pack_and_upload_code(
"sagemaker_session unspecified when creating your Processor to have one set up "
"automatically."
)
if "/sourcedir.tar.gz" in estimator.uploaded_code.s3_prefix:
# Upload the bootstrapping code as s3://.../jobname/source/runproc.sh.
entrypoint_s3_uri = estimator.uploaded_code.s3_prefix.replace(
"sourcedir.tar.gz",
"runproc.sh",
)
else:
raise RuntimeError("S3 source_dir file must be named `sourcedir.tar.gz.`")

# Upload the bootstrapping code as s3://.../jobname/source/runproc.sh.
entrypoint_s3_uri = estimator.uploaded_code.s3_prefix.replace(
"sourcedir.tar.gz",
"runproc.sh",
)
script = estimator.uploaded_code.script_name
s3_runproc_sh = S3Uploader.upload_string_as_file_body(
self._generate_framework_script(script),
Expand Down
20 changes: 20 additions & 0 deletions tests/integ/test_xgboost.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,26 @@ def xgboost_training_job(
)


def test_sourcedir_naming(
    sagemaker_session,
    xgboost_latest_version,
    xgboost_latest_py_version,
    cpu_instance_type,
):
    """Check that ``run`` rejects an S3 ``source_dir`` with the wrong file name.

    The processor is pointed at ``s3://bucket/deps.tar.gz`` and the test
    expects ``run`` to raise ``RuntimeError``.
    """
    # Build the processor outside the ``pytest.raises`` block so that an
    # unexpected RuntimeError from construction fails the test instead of
    # being mistaken for the error under test.
    processor = XGBoostProcessor(
        framework_version=xgboost_latest_version,
        role=ROLE,
        instance_count=1,
        instance_type=cpu_instance_type,
        sagemaker_session=sagemaker_session,
    )

    # Only the ``run`` call is expected to raise.
    with pytest.raises(RuntimeError):
        processor.run(
            source_dir="s3://bucket/deps.tar.gz",
            code="main_script.py",
        )


@pytest.mark.release
def test_framework_processing_job_with_deps(
sagemaker_session,
Expand Down