pull updates #2

Merged
merged 7 commits on Jun 27, 2019
13 changes: 13 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,18 @@
# Changelog

## v1.31.0 (2019-06-27)

### Features

* use deep learning images

### Bug fixes and other changes

* Update buildspec.yml
* allow only one integration test run at a time
* remove unnecessary P3 tests from TFS integration tests
* add pytest.mark.local_mode annotation to broken tests

## v1.30.0 (2019-06-25)

### Features
2 changes: 1 addition & 1 deletion VERSION
@@ -1 +1 @@
1.30.1.dev0
1.31.1.dev0
10 changes: 8 additions & 2 deletions buildspec.yml
@@ -9,7 +9,6 @@ phases:
commands:
# run linters
- tox -e flake8,pylint

# run package and docbuild checks
- tox -e twine
- tox -e sphinx
@@ -34,7 +33,14 @@ phases:
# run integration tests
- |
if has-matching-changes "tests/" "src/*.py" "setup.py" "setup.cfg" "buildspec.yml"; then
IGNORE_COVERAGE=- tox -e py36,py27 -- tests/integ -n 24 --boxed --reruns 2
python3 -u ci-scripts/queue_build.py
IGNORE_COVERAGE=- tox -e py36,py27 -- tests/integ -n 24 --reruns 3
else
echo "skipping integration tests"
fi
post_build:
finally:
- FILENAME=$(ls ci-lock/)
- ACCOUNT=$(aws sts get-caller-identity --output text | awk '{print $1}')
- S3_BUCKET_DIR=s3://sagemaker-us-west-2-${ACCOUNT}/ci-lock/
- aws s3 rm ${S3_BUCKET_DIR}${FILENAME}
117 changes: 117 additions & 0 deletions ci-scripts/queue_build.py
@@ -0,0 +1,117 @@
# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import os
import time
import boto3

account = boto3.client("sts").get_caller_identity()["Account"]
bucket_name = "sagemaker-us-west-2-%s" % account


def queue_build():
build_id = os.environ.get("CODEBUILD_BUILD_ID", "CODEBUILD-BUILD-ID")
source_version = os.environ.get("CODEBUILD_SOURCE_VERSION", "CODEBUILD-SOURCE-VERSION").replace(
"/", "-"
)
ticket_number = int(1000 * time.time())
filename = "%s_%s_%s" % (ticket_number, build_id, source_version)

print("Created queue ticket %s" % ticket_number)

_write_ticket(filename)
files = _list_tickets()
_cleanup_tickets_older_than_8_hours(files)
_wait_for_other_builds(files, ticket_number)


def _build_info_from_file(file):
filename = file.key.split("/")[1]
ticket_number, build_id, source_version = filename.split("_")
return int(ticket_number), build_id, source_version


def _wait_for_other_builds(files, ticket_number):
newfiles = list(filter(lambda file: not _file_older_than(file), files))
sorted_files = list(sorted(newfiles, key=lambda y: y.key))

print("build queue status:")
print()

for order, file in enumerate(sorted_files):
file_ticket_number, build_id, source_version = _build_info_from_file(file)
print(
"%s -> %s %s, ticket number: %s" % (order, build_id, source_version, file_ticket_number)
)

for file in sorted_files:
file_ticket_number, build_id, source_version = _build_info_from_file(file)

if file_ticket_number == ticket_number:

break
else:
while True:
client = boto3.client("codebuild")
response = client.batch_get_builds(ids=[build_id])
build_status = response["builds"][0]["buildStatus"]

if build_status == "IN_PROGRESS":
print(
"waiting on build %s %s %s" % (build_id, source_version, file_ticket_number)
)
time.sleep(30)
else:
print("build %s finished, deleting lock" % build_id)
file.delete()
break


def _cleanup_tickets_older_than_8_hours(files):
oldfiles = list(filter(_file_older_than, files))
for file in oldfiles:
print("object %s older than 8 hours. Deleting" % file.key)
file.delete()
return files


def _list_tickets():
s3 = boto3.resource("s3")
bucket = s3.Bucket(bucket_name)
objects = [file for file in bucket.objects.filter(Prefix="ci-lock/")]
files = list(filter(lambda x: x != "ci-lock/", objects))
return files


def _file_older_than(file):
timelimit = 1000 * 60 * 60 * 8

file_ticket_number, build_id, source_version = _build_info_from_file(file)

return int(time.time()) - file_ticket_number > timelimit


def _write_ticket(ticket_number):

if not os.path.exists("ci-lock"):
os.mkdir("ci-lock")

filename = "ci-lock/" + ticket_number
with open(filename, "w") as file:
file.write(ticket_number)
boto3.Session().resource("s3").Object(bucket_name, filename).upload_file(filename)


if __name__ == "__main__":
queue_build()
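
A minimal sketch of why the queue above is first-come-first-served: ticket keys start with a millisecond timestamp, so the lexicographic sort in _wait_for_other_builds orders them oldest-first. The snippet below mirrors the naming scheme from _write_ticket; the build ids and source versions are made up for illustration.

import time

def _ticket_key(build_id, source_version):
    # Same "<ms timestamp>_<build id>_<source version>" layout as queue_build above.
    ticket_number = int(1000 * time.time())
    return "ci-lock/%s_%s_%s" % (ticket_number, build_id, source_version.replace("/", "-"))

first = _ticket_key("sagemaker-python-sdk:0001", "pr/100")
time.sleep(0.01)
second = _ticket_key("sagemaker-python-sdk:0002", "pr/101")
assert sorted([second, first]) == [first, second]  # the earlier ticket stays at the head of the queue
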
72 changes: 68 additions & 4 deletions src/sagemaker/fw_utils.py
@@ -54,6 +54,56 @@
VALID_EIA_FRAMEWORKS = ["tensorflow", "tensorflow-serving", "mxnet", "mxnet-serving"]
VALID_ACCOUNTS_BY_REGION = {"us-gov-west-1": "246785580436", "us-iso-east-1": "744548109606"}

MERGED_FRAMEWORKS_REPO_MAP = {
"tensorflow-scriptmode": "tensorflow-training",
"mxnet": "mxnet-training",
"tensorflow-serving": "tensorflow-inference",
"mxnet-serving": "mxnet-inference",
}

MERGED_FRAMEWORKS_LOWEST_VERSIONS = {
"tensorflow-scriptmode": [1, 13, 1],
"mxnet": [1, 4, 1],
"tensorflow-serving": [1, 13, 0],
"mxnet-serving": [1, 4, 1],
}


def is_version_equal_or_higher(lowest_version, framework_version):
"""Determine whether the ``framework_version`` is equal to or higher than ``lowest_version``

Args:
lowest_version (List[int]): lowest version represented in an integer list
framework_version (str): framework version string

Returns:
bool: Whether or not framework_version is equal to or higher than lowest_version
"""
version_list = [int(s) for s in framework_version.split(".")]
return version_list >= lowest_version[0 : len(version_list)]
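# Worked examples (illustrative values, not part of this change): only the
# components present in framework_version are compared against a same-length
# prefix of lowest_version, so:
#   is_version_equal_or_higher([1, 13, 1], "1.13")   -> True   ([1, 13] >= [1, 13])
#   is_version_equal_or_higher([1, 13, 1], "1.12.0") -> False  ([1, 12, 0] < [1, 13, 1])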


def _is_merged_versions(framework, framework_version):
lowest_version_list = MERGED_FRAMEWORKS_LOWEST_VERSIONS.get(framework)
if lowest_version_list:
return is_version_equal_or_higher(lowest_version_list, framework_version)
else:
return False


def _using_merged_images(region, framework, py_version, accelerator_type, framework_version):
is_gov_region = region in VALID_ACCOUNTS_BY_REGION
is_py3 = py_version == "py3" or py_version is None
is_merged_versions = _is_merged_versions(framework, framework_version)
return (not is_gov_region) and is_merged_versions and is_py3 and accelerator_type is None


def _registry_id(region, framework, py_version, account, accelerator_type, framework_version):
if _using_merged_images(region, framework, py_version, accelerator_type, framework_version):
return "763104351884"
else:
return VALID_ACCOUNTS_BY_REGION.get(region, account)

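# Worked example (illustrative account number, not part of this change): for a
# py3, non-EI request outside the restricted regions, a new enough framework
# version resolves to the shared deep learning containers registry, while older
# versions keep the caller's account:
#   _registry_id("us-west-2", "tensorflow-scriptmode", "py3", "123456789012", None, "1.13.1")
#       -> "763104351884"
#   _registry_id("us-west-2", "tensorflow-scriptmode", "py3", "123456789012", None, "1.12.0")
#       -> "123456789012"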

def create_image_uri(
region,
@@ -86,8 +136,15 @@ def create_image_uri(
if py_version and py_version not in VALID_PY_VERSIONS:
raise ValueError("invalid py_version argument: {}".format(py_version))

# Handle Account Number for Gov Cloud
account = VALID_ACCOUNTS_BY_REGION.get(region, account)
# Handle Account Number for Gov Cloud and frameworks with DLC merged images
account = _registry_id(
region=region,
framework=framework,
py_version=py_version,
account=account,
accelerator_type=accelerator_type,
framework_version=framework_version,
)

# Handle Local Mode
if instance_type.startswith("local"):
@@ -121,7 +178,14 @@
):
framework += "-eia"

return "{}/sagemaker-{}:{}".format(get_ecr_image_uri_prefix(account, region), framework, tag)
if _using_merged_images(region, framework, py_version, accelerator_type, framework_version):
return "{}/{}:{}".format(
get_ecr_image_uri_prefix(account, region), MERGED_FRAMEWORKS_REPO_MAP[framework], tag
)
else:
return "{}/sagemaker-{}:{}".format(
get_ecr_image_uri_prefix(account, region), framework, tag
)
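# Illustrative resulting URIs (assumed, based on the usual ECR prefix returned by
# get_ecr_image_uri_prefix and a "<version>-<device>-<py_version>" tag):
#   merged repo:  763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:1.13.1-gpu-py3
#   legacy repo:  123456789012.dkr.ecr.us-west-2.amazonaws.com/sagemaker-tensorflow-scriptmode:1.12.0-gpu-py3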


def _accelerator_type_valid_for_framework(
@@ -264,7 +328,7 @@ def framework_name_from_image(image_name):
# extract framework, python version and image tag
# We must support both the legacy and current image name format.
name_pattern = re.compile(
r"^sagemaker(?:-rl)?-(tensorflow|mxnet|chainer|pytorch|scikit-learn)(?:-)?(scriptmode)?:(.*)-(.*?)-(py2|py3)$" # noqa: E501
r"^(?:sagemaker(?:-rl)?-)?(tensorflow|mxnet|chainer|pytorch|scikit-learn)(?:-)?(scriptmode|training)?:(.*)-(.*?)-(py2|py3)$" # noqa: E501
)
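# Illustrative match (hypothetical repo tag, not part of this change): the relaxed
# pattern also accepts the merged repo names that drop the "sagemaker-" prefix, e.g.
#   "tensorflow-training:1.13.1-gpu-py3" -> ("tensorflow", "training", "1.13.1", "gpu", "py3")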
legacy_name_pattern = re.compile(r"^sagemaker-(tensorflow|mxnet)-(py2|py3)-(cpu|gpu):(.*)$")

3 changes: 3 additions & 0 deletions tests/integ/test_git.py
@@ -15,6 +15,7 @@
import os

import numpy
import pytest
import tempfile

from tests.integ import lock as lock
@@ -30,6 +31,7 @@
LOCK_PATH = os.path.join(tempfile.gettempdir(), "sagemaker_test_git_lock")


@pytest.mark.local_mode
def test_git_support_with_pytorch(sagemaker_local_session):
script_path = "mnist.py"
data_path = os.path.join(DATA_DIR, "pytorch_mnist")
@@ -59,6 +61,7 @@ def test_git_support_with_pytorch(sagemaker_local_session):
predictor.delete_endpoint()


@pytest.mark.local_mode
def test_git_support_with_mxnet(sagemaker_local_session, mxnet_full_version):
script_path = "mnist.py"
data_path = os.path.join(DATA_DIR, "mxnet_mnist")
4 changes: 4 additions & 0 deletions tests/integ/test_tf_script_mode.py
@@ -65,6 +65,7 @@ def test_mnist(sagemaker_session, instance_type):
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=TensorFlow.LATEST_VERSION,
py_version=tests.integ.PYTHON_VERSION,
metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}],
)
inputs = estimator.sagemaker_session.upload_data(
@@ -98,6 +99,7 @@ def test_server_side_encryption(sagemaker_session):
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=TensorFlow.LATEST_VERSION,
py_version=tests.integ.PYTHON_VERSION,
code_location=output_path,
output_path=output_path,
model_dir="/opt/ml/model",
@@ -144,6 +146,7 @@ def test_mnist_async(sagemaker_session):
role=ROLE,
train_instance_count=1,
train_instance_type="ml.c5.4xlarge",
py_version=tests.integ.PYTHON_VERSION,
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=TensorFlow.LATEST_VERSION,
@@ -182,6 +185,7 @@ def test_deploy_with_input_handlers(sagemaker_session, instance_type):
role=ROLE,
train_instance_count=1,
train_instance_type=instance_type,
py_version=tests.integ.PYTHON_VERSION,
sagemaker_session=sagemaker_session,
script_mode=True,
framework_version=TensorFlow.LATEST_VERSION,
27 changes: 4 additions & 23 deletions tests/integ/test_tfs.py
@@ -26,25 +26,8 @@
from sagemaker.tensorflow.serving import Model, Predictor


@pytest.fixture(
scope="session",
params=[
"ml.c5.xlarge",
pytest.param(
"ml.p3.2xlarge",
marks=pytest.mark.skipif(
tests.integ.test_region() in tests.integ.HOSTING_NO_P3_REGIONS,
reason="no ml.p3 instances in this region",
),
),
],
)
def instance_type(request):
return request.param


@pytest.fixture(scope="module")
def tfs_predictor(instance_type, sagemaker_session, tf_full_version):
def tfs_predictor(sagemaker_session, tf_full_version):
endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-tensorflow-serving")
model_data = sagemaker_session.upload_data(
path=os.path.join(tests.integ.DATA_DIR, "tensorflow-serving-test-model.tar.gz"),
@@ -57,7 +40,7 @@ def tfs_predictor(instance_type, sagemaker_session, tf_full_version):
framework_version=tf_full_version,
sagemaker_session=sagemaker_session,
)
predictor = model.deploy(1, instance_type, endpoint_name=endpoint_name)
predictor = model.deploy(1, "ml.c5.xlarge", endpoint_name=endpoint_name)
yield predictor


@@ -130,8 +113,6 @@ def tfs_predictor_with_model_and_entry_point_and_dependencies(
@pytest.fixture(scope="module")
def tfs_predictor_with_accelerator(sagemaker_session, tf_full_version):
endpoint_name = sagemaker.utils.unique_name_from_base("sagemaker-tensorflow-serving")
instance_type = "ml.c4.large"
accelerator_type = "ml.eia1.medium"
model_data = sagemaker_session.upload_data(
path=os.path.join(tests.integ.DATA_DIR, "tensorflow-serving-test-model.tar.gz"),
key_prefix="tensorflow-serving/models",
@@ -144,13 +125,13 @@ def tfs_predictor_with_accelerator(sagemaker_session, tf_full_version):
sagemaker_session=sagemaker_session,
)
predictor = model.deploy(
1, instance_type, endpoint_name=endpoint_name, accelerator_type=accelerator_type
1, "ml.c4.large", endpoint_name=endpoint_name, accelerator_type="ml.eia1.medium"
)
yield predictor


@pytest.mark.canary_quick
def test_predict(tfs_predictor, instance_type): # pylint: disable=W0613
def test_predict(tfs_predictor): # pylint: disable=W0613
input_data = {"instances": [1.0, 2.0, 5.0]}
expected_result = {"predictions": [3.5, 4.0, 5.5]}
