modified overview.rst, add lock for tests

Yue Tu · Yue Tu · commit c1bae10b8b89 · 2019-06-19T13:07:06.000-07:00
diff --git a/.gitignore b/.gitignore
@@ -25,4 +25,4 @@ venv/
 *~
 .pytest_cache/
 *.swp
-tests/data/local_mode_lock
+.docker/
diff --git a/.python-version b/.python-version
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Changelog
 
+## v1.28.2 (2019-06-19)
+
+### Bug fixes and other changes
+
+ * prevent race condition in vpc tests
+
 ## v1.28.1 (2019-06-17)
 
 ### Bug fixes and other changes
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-1.28.2.dev0
+1.28.3.dev0
diff --git a/doc/overview.rst b/doc/overview.rst
@@ -88,7 +88,7 @@ Git Support
 ~~~~~~~~~~~
 If you have your training scripts in your GitHub repository, you can use them directly without the trouble to download
 them to local machine. Git support can be enabled simply by providing ``git_config`` parameter when initializing an
-estimator. If git support is enabled, then ``entry_point``, ``source_dir`` and  ``dependencies`` should all be relative
+estimator. If Git support is enabled, then ``entry_point``, ``source_dir`` and ``dependencies`` should all be relative
 paths in the Git repo. Note that if you decided to use Git support, then everything you need for ``entry_point``,
 ``source_dir`` and ``dependencies`` should be in a single Git repo.
 
@@ -97,18 +97,18 @@ Here are ways to specify ``git_config``:
 .. code:: python
 
         # Specifies the git_config parameter
-        git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git',
+        git_config = {'repo': 'https://github.com/username/repo-with-training-scripts.git',
                       'branch': 'branch1',
                       'commit': '4893e528afa4a790331e1b5286954f073b0f14a2'}
 
         # Alternatively, you can also specify git_config by providing only 'repo' and 'branch'.
         # If this is the case, the latest commit in the branch will be used.
-        git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git',
+        git_config = {'repo': 'https://github.com/username/repo-with-training-scripts.git',
                       'branch': 'branch1'}
 
         # Only providing 'repo' is also allowed. If this is the case, latest commit in
         # 'master' branch will be used.
-        git_config = {'repo': 'https://github.com/GaryTu1020/python-sdk-testing.git'
+        git_config = {'repo': 'https://github.com/username/repo-with-training-scripts.git'}
 
 The following are some examples to define estimators with Git support:
 
@@ -121,17 +121,15 @@ The following are some examples to define estimators with Git support:
                                     source_dir='pytorch',
                                     git_config=git_config,
                                     train_instance_count=1,
-                                    train_instance_type='ml.c4.xlarge',
-                                    sagemaker_session=sagemaker_local_session)
+                                    train_instance_type='ml.c4.xlarge')
 
         # In this example, the entry point 'mnist.py' is all we need for source code.
         # We need to specify the path to it in the Git repo.
         mx_estimator = MXNet(entry_point='mxnet/mnist.py',
                                     role='SageMakerRole',
                                     git_config=git_config,
                                     train_instance_count=1,
-                                    train_instance_type='ml.c4.xlarge',
-                                    sagemaker_session=sagemaker_local_session)
+                                    train_instance_type='ml.c4.xlarge')
 
         # In this example, besides entry point and other source code in source directory, we still need some
         # dependencies for the training job. Dependencies should also be paths inside the Git repo.
@@ -141,8 +139,7 @@ The following are some examples to define estimators with Git support:
                                     dependencies=['dep.py', 'foo/bar.py'],
                                     git_config=git_config,
                                     train_instance_count=1,
-                                    train_instance_type='ml.c4.xlarge',
-                                    sagemaker_session=sagemaker_local_session)
+                                    train_instance_type='ml.c4.xlarge')
 
 When Git support is enabled, users can still use local mode in the same way.
 
diff --git a/tests/integ/lock.py b/tests/integ/lock.py
@@ -15,24 +15,25 @@
 import fcntl
 import os
 import time
+import tempfile
 from contextlib import contextmanager
 
-import tests.integ
-
-LOCK_PATH = os.path.join(tests.integ.DATA_DIR, 'local_mode_lock')
+DEFAULT_LOCK_PATH = os.path.join(tempfile.gettempdir(), 'sagemaker_test_lock')
 
 
 @contextmanager
-def lock():
-    # Since Local Mode uses the same port for serving, we need a lock in order
-    # to allow concurrent test execution.
-    local_mode_lock_fd = open(LOCK_PATH, 'w')
-    local_mode_lock = local_mode_lock_fd.fileno()
+def lock(path=DEFAULT_LOCK_PATH):
+    """Create a file lock to control concurrent test execution. Certain tests or
+    test operations need to limit concurrency to work reliably. Examples include
+    local mode endpoint tests and vpc creation tests.
+    """
+    f = open(path, 'w')
+    fd = f.fileno()
 
-    fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
+    fcntl.lockf(fd, fcntl.LOCK_EX)
 
     try:
         yield
     finally:
         time.sleep(5)
-        fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)
+        fcntl.lockf(fd, fcntl.LOCK_UN)
diff --git a/tests/integ/test_git.py b/tests/integ/test_git.py
@@ -15,14 +15,17 @@
 import os
 
 import numpy
+import tempfile
 
+from tests.integ import lock as lock
 from sagemaker.mxnet.estimator import MXNet
 from sagemaker.pytorch.estimator import PyTorch
 from tests.integ import DATA_DIR, PYTHON_VERSION
 
 GIT_REPO = 'https://github.com/aws/sagemaker-python-sdk.git'
 BRANCH = 'test-branch-git-config'
 COMMIT = '329bfcf884482002c05ff7f44f62599ebc9f445a'
+LOCK_PATH = os.path.join(tempfile.gettempdir(), 'sagemaker_test_git_lock')
 
 
 def test_git_support_with_pytorch(sagemaker_local_session):
@@ -36,14 +39,15 @@ def test_git_support_with_pytorch(sagemaker_local_session):
 
     pytorch.fit({'training': 'file://' + os.path.join(data_path, 'training')})
 
-    try:
-        predictor = pytorch.deploy(initial_instance_count=1, instance_type='local')
+    with lock.lock(LOCK_PATH):
+        try:
+            predictor = pytorch.deploy(initial_instance_count=1, instance_type='local')
 
-        data = numpy.zeros(shape=(1, 1, 28, 28)).astype(numpy.float32)
-        result = predictor.predict(data)
-        assert result is not None
-    finally:
-        predictor.delete_endpoint()
+            data = numpy.zeros(shape=(1, 1, 28, 28)).astype(numpy.float32)
+            result = predictor.predict(data)
+            assert result is not None
+        finally:
+            predictor.delete_endpoint()
 
 
 def test_git_support_with_mxnet(sagemaker_local_session, mxnet_full_version):
@@ -65,11 +69,12 @@ def test_git_support_with_mxnet(sagemaker_local_session, mxnet_full_version):
     assert 'mnist.py' in files
     assert os.path.exists(mx.dependencies[0])
 
-    try:
-        predictor = mx.deploy(initial_instance_count=1, instance_type='local')
+    with lock.lock(LOCK_PATH):
+        try:
+            predictor = mx.deploy(initial_instance_count=1, instance_type='local')
 
-        data = numpy.zeros(shape=(1, 1, 28, 28))
-        result = predictor.predict(data)
-        assert result is not None
-    finally:
-        predictor.delete_endpoint()
+            data = numpy.zeros(shape=(1, 1, 28, 28))
+            result = predictor.predict(data)
+            assert result is not None
+        finally:
+            predictor.delete_endpoint()
diff --git a/tests/integ/test_local_mode.py b/tests/integ/test_local_mode.py
@@ -18,14 +18,18 @@
 import boto3
 import numpy
 import pytest
-import tests.integ.local_mode_utils as local_mode_utils
+import tempfile
+
+import tests.integ.lock as lock
 from tests.integ import DATA_DIR, PYTHON_VERSION
 from tests.integ.timeout import timeout
 
 from sagemaker.local import LocalSession, LocalSagemakerRuntimeClient, LocalSagemakerClient
 from sagemaker.mxnet import MXNet
 from sagemaker.tensorflow import TensorFlow
 
+# endpoint tests all use the same port, so we use this lock to prevent concurrent execution
+LOCK_PATH = os.path.join(tempfile.gettempdir(), 'sagemaker_test_local_mode_lock')
 DATA_PATH = os.path.join(DATA_DIR, 'iris', 'data')
 DEFAULT_REGION = 'us-west-2'
 
@@ -101,7 +105,7 @@ def test_tf_local_mode(tf_full_version, sagemaker_local_session):
         print('job succeeded: {}'.format(estimator.latest_training_job.name))
 
     endpoint_name = estimator.latest_training_job.name
-    with local_mode_utils.lock():
+    with lock.lock(LOCK_PATH):
         try:
             json_predictor = estimator.deploy(initial_instance_count=1,
                                               instance_type='local',
@@ -140,7 +144,7 @@ def test_tf_distributed_local_mode(sagemaker_local_session):
 
     endpoint_name = estimator.latest_training_job.name
 
-    with local_mode_utils.lock():
+    with lock.lock(LOCK_PATH):
         try:
             json_predictor = estimator.deploy(initial_instance_count=1,
                                               instance_type='local',
@@ -178,7 +182,7 @@ def test_tf_local_data(sagemaker_local_session):
         print('job succeeded: {}'.format(estimator.latest_training_job.name))
 
     endpoint_name = estimator.latest_training_job.name
-    with local_mode_utils.lock():
+    with lock.lock(LOCK_PATH):
         try:
             json_predictor = estimator.deploy(initial_instance_count=1,
                                               instance_type='local',
@@ -217,7 +221,7 @@ def test_tf_local_data_local_script():
         print('job succeeded: {}'.format(estimator.latest_training_job.name))
 
     endpoint_name = estimator.latest_training_job.name
-    with local_mode_utils.lock():
+    with lock.lock(LOCK_PATH):
         try:
             json_predictor = estimator.deploy(initial_instance_count=1,
                                               instance_type='local',
@@ -241,7 +245,7 @@ def test_local_mode_serving_from_s3_model(sagemaker_local_session, mxnet_model,
     s3_model.sagemaker_session = sagemaker_local_session
 
     predictor = None
-    with local_mode_utils.lock():
+    with lock.lock(LOCK_PATH):
         try:
             predictor = s3_model.deploy(initial_instance_count=1, instance_type='local')
             data = numpy.zeros(shape=(1, 1, 28, 28))
@@ -255,7 +259,7 @@ def test_local_mode_serving_from_s3_model(sagemaker_local_session, mxnet_model,
 def test_local_mode_serving_from_local_model(tmpdir, sagemaker_local_session, mxnet_model):
     predictor = None
 
-    with local_mode_utils.lock():
+    with lock.lock(LOCK_PATH):
         try:
             path = 'file://%s' % (str(tmpdir))
             model = mxnet_model(path)
@@ -285,7 +289,7 @@ def test_mxnet_local_mode(sagemaker_local_session, mxnet_full_version):
     mx.fit({'train': train_input, 'test': test_input})
     endpoint_name = mx.latest_training_job.name
 
-    with local_mode_utils.lock():
+    with lock.lock(LOCK_PATH):
         try:
             predictor = mx.deploy(1, 'local', endpoint_name=endpoint_name)
             data = numpy.zeros(shape=(1, 1, 28, 28))
@@ -310,7 +314,7 @@ def test_mxnet_local_data_local_script(mxnet_full_version):
     mx.fit({'train': train_input, 'test': test_input})
     endpoint_name = mx.latest_training_job.name
 
-    with local_mode_utils.lock():
+    with lock.lock(LOCK_PATH):
         try:
             predictor = mx.deploy(1, 'local', endpoint_name=endpoint_name)
             data = numpy.zeros(shape=(1, 1, 28, 28))
@@ -365,7 +369,7 @@ def test_local_transform_mxnet(sagemaker_local_session, tmpdir, mxnet_full_versi
     transformer = mx.transformer(1, 'local', assemble_with='Line', max_payload=1,
                                  strategy='SingleRecord', output_path=output_path)
 
-    with local_mode_utils.lock():
+    with lock.lock(LOCK_PATH):
         transformer.transform(transform_input, content_type='text/csv', split_type='Line')
         transformer.wait()
 
diff --git a/tests/integ/test_source_dirs.py b/tests/integ/test_source_dirs.py
@@ -16,7 +16,7 @@
 
 import pytest
 
-import tests.integ.local_mode_utils as local_mode_utils
+import tests.integ.lock as lock
 from tests.integ import DATA_DIR, PYTHON_VERSION
 
 from sagemaker.pytorch.estimator import PyTorch
@@ -37,7 +37,8 @@ def test_source_dirs(tmpdir, sagemaker_local_session):
                         sagemaker_session=sagemaker_local_session)
     estimator.fit()
 
-    with local_mode_utils.lock():
+    # endpoint tests all use the same port, so we use this lock to prevent concurrent execution
+    with lock.lock():
         try:
             predictor = estimator.deploy(initial_instance_count=1, instance_type='local')
             predict_response = predictor.predict([7])
diff --git a/tests/integ/vpc_test_utils.py b/tests/integ/vpc_test_utils.py
@@ -12,7 +12,13 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
+import os
+import tempfile
+
+import tests.integ.lock as lock
+
 VPC_NAME = 'sagemaker-python-sdk-test-vpc'
+LOCK_PATH = os.path.join(tempfile.gettempdir(), 'sagemaker_test_vpc_lock')
 
 
 def _get_subnet_ids_by_name(ec2_client, name):
@@ -61,20 +67,24 @@ def _create_vpc_with_name(ec2_client, region, name):
                                            AvailabilityZone=(region + 'b'))['Subnet']['SubnetId']
     print('created subnet: {}'.format(subnet_id_b))
 
-    s3_service = [s for s in ec2_client.describe_vpc_endpoint_services()['ServiceNames'] if s.endswith('s3')][0]
+    s3_service = \
+        [s for s in ec2_client.describe_vpc_endpoint_services()['ServiceNames'] if
+         s.endswith('s3')][0]
     ec2_client.create_vpc_endpoint(VpcId=vpc_id, ServiceName=s3_service,
                                    RouteTableIds=[_get_route_table_id(ec2_client, vpc_id)])
     print('created s3 vpc endpoint')
 
-    security_group_id = ec2_client.create_security_group(VpcId=vpc_id, GroupName=name, Description=name)['GroupId']
+    security_group_id = \
+        ec2_client.create_security_group(VpcId=vpc_id, GroupName=name, Description=name)['GroupId']
     print('created security group: {}'.format(security_group_id))
 
     # multi-host vpc jobs require communication among hosts
     ec2_client.authorize_security_group_ingress(GroupId=security_group_id,
                                                 IpPermissions=[{'IpProtocol': 'tcp',
                                                                 'FromPort': 0,
                                                                 'ToPort': 65535,
-                                                                'UserIdGroupPairs': [{'GroupId': security_group_id}]}])
+                                                                'UserIdGroupPairs': [{
+                                                                    'GroupId': security_group_id}]}])
 
     ec2_client.create_tags(Resources=[vpc_id, subnet_id_a, subnet_id_b, security_group_id],
                            Tags=[{'Key': 'Name', 'Value': name}])
@@ -83,23 +93,28 @@ def _create_vpc_with_name(ec2_client, region, name):
 
 
 def get_or_create_vpc_resources(ec2_client, region, name=VPC_NAME):
-    if _vpc_exists(ec2_client, name):
-        print('using existing vpc: {}'.format(name))
-        return _get_subnet_ids_by_name(ec2_client, name), _get_security_id_by_name(ec2_client, name)
-    else:
-        print('creating new vpc: {}'.format(name))
-        return _create_vpc_with_name(ec2_client, region, name)
+    # use lock to prevent race condition when tests are running concurrently
+    with lock.lock(LOCK_PATH):
+        if _vpc_exists(ec2_client, name):
+            print('using existing vpc: {}'.format(name))
+            return _get_subnet_ids_by_name(ec2_client, name), _get_security_id_by_name(ec2_client,
+                                                                                       name)
+        else:
+            print('creating new vpc: {}'.format(name))
+            return _create_vpc_with_name(ec2_client, region, name)
 
 
 def setup_security_group_for_encryption(ec2_client, security_group_id):
     sg_desc = ec2_client.describe_security_groups(GroupIds=[security_group_id])
     ingress_perms = sg_desc['SecurityGroups'][0]['IpPermissions']
     if len(ingress_perms) == 1:
-        ec2_client.\
+        ec2_client. \
             authorize_security_group_ingress(GroupId=security_group_id,
                                              IpPermissions=[{'IpProtocol': '50',
-                                                             'UserIdGroupPairs': [{'GroupId': security_group_id}]},
+                                                             'UserIdGroupPairs': [
+                                                                 {'GroupId': security_group_id}]},
                                                             {'IpProtocol': 'udp',
                                                              'FromPort': 500,
                                                              'ToPort': 500,
-                                                             'UserIdGroupPairs': [{'GroupId': security_group_id}]}])
+                                                             'UserIdGroupPairs': [
+                                                                 {'GroupId': security_group_id}]}])
diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py
@@ -680,7 +680,7 @@ def test_git_support_bad_repo_url_format(sagemaker_session):
 
 
 def test_git_support_git_clone_fail(sagemaker_session):
-    git_config = {'repo': 'https://github.com/GaryTu1020/no-such-repo.git', 'branch': BRANCH}
+    git_config = {'repo': 'https://github.com/aws/no-such-repo.git', 'branch': BRANCH}
     fw = DummyFramework(entry_point='entry_point', git_config=git_config, role=ROLE,
                         sagemaker_session=sagemaker_session, train_instance_count=INSTANCE_COUNT,
                         train_instance_type=INSTANCE_TYPE, enable_cloudwatch_metrics=True)