
Commit 240ca21

sunil19m authored and yijiezh committed
SageMaker public notebook consistent with the AWS DeepRacer console code (#927)

* SageMaker public notebook consistent with the AWS DeepRacer console code
  1. Fix the exception being thrown by 4XX errors
  2. Separate user and system errors for S3 calls (a sketch of one approach follows this list)
  3. Fix an out-of-memory issue in the training worker
  4. Reduce the time to upload sim traces to S3 to 1 minute before the job exits
  5. Remove sending training metrics to CloudWatch, as the reward graph uses the training metrics from the S3 bucket
  6. Upload SIM_TRACE data to the S3 bucket of the launched job for training and evaluation
  7. Demote memory backend system errors to informational logs, as SIMAPP is not halted by these errors
  8. Don't allow the car to move backwards
  9. Call os._exit when SIMAPP encounters a system or user error leading to exit
* Change the S3 bucket for the RoboMaker simapp
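Item 2 of the change list mentions separating user (4XX) from system (5XX) errors on S3 calls; the sketch referenced there is below. It is illustrative only, not the SIMAPP code from this commit, and the helper name and logging choices are hypothetical.

# Illustrative sketch only -- not the SIMAPP implementation from this commit.
import logging

import boto3
import botocore.exceptions

log = logging.getLogger(__name__)

def download_object(bucket, key, local_path):
    """Hypothetical helper: download an S3 object, classifying failures."""
    s3_client = boto3.client("s3")
    try:
        s3_client.download_file(bucket, key, local_path)
    except botocore.exceptions.ClientError as err:
        status = err.response.get("ResponseMetadata", {}).get("HTTPStatusCode", 500)
        if 400 <= status < 500:
            # User error (missing key, access denied, ...): surface it clearly.
            log.error("User error downloading s3://%s/%s: %s", bucket, key, err)
        else:
            # System error: service-side or transient issue.
            log.info("System error downloading s3://%s/%s: %s", bucket, key, err)
        raise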
1 parent 59918d2 commit 240ca21

14 files changed: +1220 −372 lines changed
copy_to_sagemaker_container.py

Lines changed: 89 additions & 0 deletions
@@ -0,0 +1,89 @@
import subprocess
import boto3

SAGEMAKER_DOCKER_MARKOV_PATH = "/opt/amazon/markov"
MARKOV_FOLDER = "./src/markov"


def run_cmd(cmd_args, change_working_directory="./", shell=False, executable=None):
    """
    Execute a shell command and capture its standard output.

    return (returncode, result)
    :returncode: int - exit code of the command (0 on success)
    :result: list - lines of output produced by the command

    :param cmd_args: list - the command and its arguments; joined into a single
                     string and run through the shell when shell=True, otherwise
                     executed directly as pipes.
    :param change_working_directory: string - directory in which to execute the command.
    :param shell: bool - whether to run the command through the shell.
    :param executable: string - optional shell executable to use when shell=True.
    """
    cmd = " ".join(map(str, cmd_args))
    print(cmd)
    process = subprocess.Popen(
        cmd if shell else cmd_args,
        cwd=change_working_directory,
        shell=shell,
        executable=executable,
        stdout=subprocess.PIPE
    )
    result = list()
    for line in iter(process.stdout.readline, b""):
        result.append(line.decode("utf-8").rstrip())
    process.communicate()
    return process.returncode, result


def get_sagemaker_docker(repository_short_name):
    """
    If the SageMaker docker image has already been created, pick the most recently
    created image with the repository name you created earlier (the SageMaker docker)
    and return its id. If no such image is present, raise an exception so the caller
    can build one.

    return (docker_id)
    :docker_id: string - the SageMaker docker image id.
    """
    _, docker_ids = run_cmd([r'docker images {} | sed -n 2,2p'.format(repository_short_name)], shell=True)
    if docker_ids and docker_ids[0]:
        docker_id = [docker for docker in docker_ids[0].split(" ") if docker != ""]
        print("Sagemaker docker id : {}".format(docker_id[2]))
        return docker_id[2]
    raise Exception("SageMaker docker not found. Please check.")


def copy_to_sagemaker_container(sagemaker_docker_id, repository_short_name):
    """
    Copy the notebook's src/markov package into the SageMaker container. This is
    required because the docker image would already have been built with the original
    code, so a container is started from the image, the files from the src package
    are copied in, and the result is committed back to the repository.
    """
    _, docker_containers = run_cmd(["docker run -d -t {}".format(sagemaker_docker_id)], shell=True)
    #
    # Docker cp does not overwrite files if they are modified. This is fixed in a
    # newer docker version, but not in the current one. Hence the markov folder is
    # deleted first and then the files are copied to the container.
    #

    # Copy Markov package
    # Deleting markov folder in the sagemaker container
    run_cmd(["docker exec -d {0} rm -rf {1}".format(docker_containers[0], SAGEMAKER_DOCKER_MARKOV_PATH)],
            shell=True)
    # Copying markov folder to the sagemaker container
    run_cmd(["docker cp {0} {1}:{2}".format(MARKOV_FOLDER,
                                            docker_containers[0],
                                            SAGEMAKER_DOCKER_MARKOV_PATH)], shell=True)
    print("============ Copied Markov scripts to sagemaker docker ============ \n ")

    docker_processes = run_cmd(["docker ps -l|sed -n 2,2p"], shell=True)
    docker_ps = [docker_process for docker_process in docker_processes[1][0].split(" ") if docker_process != ""][0]

    # Committing all the changes to the docker image
    run_cmd([r'docker commit {0} {1}'.format(docker_ps, repository_short_name)], shell=True)
    print("============ Committed all the changes to docker ============ \n ")


def get_custom_image_name(custom_image_name):
    """Return the full ECR image tag for the given repository name in this account and region."""
    session = boto3.Session()
    aws_account = session.client("sts").get_caller_identity()['Account']
    aws_region = session.region_name
    ecr_repo = '%s.dkr.ecr.%s.amazonaws.com' % (aws_account, aws_region)
    ecr_tag = '%s/%s' % (ecr_repo, custom_image_name)
    return ecr_tag
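
For reference, the notebook cell shown in the diff below drives these helpers roughly as follows. This is only a condensed sketch of that flow; the repository_short_name value here is illustrative (the notebook derives it from the instance type).

# Condensed usage sketch (see the notebook diff below for the actual cell).
from copy_to_sagemaker_container import (
    get_sagemaker_docker,
    copy_to_sagemaker_container,
    get_custom_image_name,
)

repository_short_name = "sagemaker-docker-cpu"  # or "sagemaker-docker-gpu"
custom_image_name = get_custom_image_name(repository_short_name)
try:
    # Reuse an existing image: copy the local ./src/markov changes into it
    # and commit them back under the same repository name.
    sagemaker_docker_id = get_sagemaker_docker(repository_short_name)
    copy_to_sagemaker_container(sagemaker_docker_id, repository_short_name)
except Exception:
    # No local image yet: the notebook falls back to building and pushing a
    # fresh one with build_and_push_docker_image (defined elsewhere in the notebook).
    pass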

reinforcement_learning/rl_deepracer_robomaker_coach_gazebo/deepracer_rl.ipynb

Lines changed: 92 additions & 38 deletions
@@ -40,6 +40,45 @@
 "## Prequisites"
 ]
 },
+{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Use below commands only if you want to make changes to the simulation application (Robomaker code changes)"
+ ]
+},
+{
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# #\n",
+ "# # Run these commands if you want to modify the simapp\n",
+ "# #\n",
+ "# # Clean the build directory if present\n",
+ "# !python3 sim_app_bundler.py --clean\n",
+ "\n",
+ "# # Download Robomaker simApp from the deepracer public s3 bucket\n",
+ "# simulation_application_bundle_location = \"s3://deepracer-managed-resources-us-east-1/deepracer-simapp-notebook.tar.gz\"\n",
+ "# !aws s3 cp {simulation_application_bundle_location} ./\n",
+ "\n",
+ "# # Untar the simapp bundle\n",
+ "# !python3 sim_app_bundler.py --untar ./deepracer-simapp.tar.gz\n",
+ "\n",
+ "# # Now modify the simapp from build directory and run this command.\n",
+ "\n",
+ "# # Most of the simapp files can be found here (Robomaker changes)\n",
+ "# # bundle/opt/install/sagemaker_rl_agent/lib/python3.5/site-packages/\n",
+ "# # bundle/opt/install/deepracer_simulation_environment/share/deepracer_simulation_environment/\n",
+ "# # bundle/opt/install/deepracer_simulation_environment/lib/deepracer_simulation_environment/\n",
+ "\n",
+ "# # Copying the notebook src/markov changes to the simapp (For sagemaker container)\n",
+ "# !rsync -av ./src/markov/ ./build/simapp/bundle/opt/install/sagemaker_rl_agent/lib/python3.5/site-packages/markov\n",
+ "\n",
+ "# !python3 sim_app_bundler.py --tar"
+ ]
+},
 {
 "cell_type": "markdown",
 "metadata": {},
@@ -246,14 +285,40 @@
 "outputs": [],
 "source": [
 "%%time\n",
+"from copy_to_sagemaker_container import get_sagemaker_docker, copy_to_sagemaker_container, get_custom_image_name\n",
 "cpu_or_gpu = 'gpu' if instance_type.startswith('ml.p') else 'cpu'\n",
 "repository_short_name = \"sagemaker-docker-%s\" % cpu_or_gpu\n",
-"docker_build_args = {\n",
-" 'CPU_OR_GPU': cpu_or_gpu, \n",
-" 'AWS_REGION': boto3.Session().region_name,\n",
-"}\n",
-"custom_image_name = build_and_push_docker_image(repository_short_name, build_args=docker_build_args)\n",
-"print(\"Using ECR image %s\" % custom_image_name)"
+"custom_image_name = get_custom_image_name(repository_short_name)\n",
+"try:\n",
+" print(\"Copying files from your notebook to existing sagemaker container\")\n",
+" sagemaker_docker_id = get_sagemaker_docker(repository_short_name)\n",
+" copy_to_sagemaker_container(sagemaker_docker_id, repository_short_name)\n",
+"except Exception as e:\n",
+" print(\"Creating sagemaker container\")\n",
+" docker_build_args = {\n",
+" 'CPU_OR_GPU': cpu_or_gpu, \n",
+" 'AWS_REGION': boto3.Session().region_name,\n",
+" }\n",
+" custom_image_name = build_and_push_docker_image(repository_short_name, build_args=docker_build_args)\n",
+" print(\"Using ECR image %s\" % custom_image_name)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"### Clean the docker images\n",
+"Remove this only when you want to completely remove the docker or clean up the space of the sagemaker instance"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {},
+"outputs": [],
+"source": [
+"# !docker rm -f $(docker ps -a -q);\n",
+"# !docker rmi -f $(docker images -q);"
 ]
 },
 {
@@ -280,9 +345,11 @@
 "# If this is not present. Use the default VPC connnection\n",
 "#\n",
 "deepracer_security_groups = [group[\"GroupId\"] for group in ec2.describe_security_groups()['SecurityGroups']\\\n",
-" if group['GroupName'].startswith(\"deepracer-vpc\")]\n",
+" if group['GroupName'].startswith(\"aws-deepracer-\")]\n",
+"\n",
+"# deepracer_security_groups = False\n",
 "if(deepracer_security_groups):\n",
-" print(\"Using the DeepRacer VPC stacks\")\n",
+" print(\"Using the DeepRacer VPC stacks. This will be created if you run one training job from console.\")\n",
 " deepracer_vpc = [vpc['VpcId'] for vpc in ec2.describe_vpcs()['Vpcs'] \\\n",
 " if \"Tags\" in vpc for val in vpc['Tags'] \\\n",
 " if val['Value'] == 'deepracer-vpc'][0]\n",
@@ -351,7 +418,7 @@
 " display(Markdown(generate_help_for_s3_endpoint_permissions(role)))\n",
 " raise e\n",
 " else:\n",
-" display(Markdown(create_s3_endpoint_manually(aws_region, default_vpc)))\n",
+" display(Markdown(create_s3_endpoint_manually(aws_region, deepracer_vpc)))\n",
 " raise e\n",
 "\n",
 "if CREATE_ROUTE_TABLE:\n",
@@ -579,18 +646,23 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# Download Robomaker simApp for the deepracer public s3 bucket\n",
-"simulation_application_bundle_location = \"s3://deepracer-managed-resources-us-east-1/deepracer-simapp.tar.gz\"\n",
-"!aws s3 cp {simulation_application_bundle_location} ./\n",
+"if not os.path.exists('./build/output.tar.gz'):\n",
+" print(\"Using the latest simapp from public s3 bucket\")\n",
+" # Download Robomaker simApp for the deepracer public s3 bucket\n",
+" simulation_application_bundle_location = \"s3://deepracer-managed-resources-us-east-1/deepracer-simapp-notebook.tar.gz\"\n",
+" !aws s3 cp {simulation_application_bundle_location} ./\n",
 "\n",
-"# Remove if the Robomaker sim-app is present in s3 bucket\n",
-"!aws s3 rm s3://{s3_bucket}/{robomaker_s3_key}\n",
+" # Remove if the Robomaker sim-app is present in s3 bucket\n",
+" !aws s3 rm s3://{s3_bucket}/{robomaker_s3_key}\n",
 "\n",
-"# Uploading the Robomaker SimApp to your S3 bucket\n",
-"!aws s3 cp ./deepracer-simapp.tar.gz s3://{s3_bucket}/{robomaker_s3_key}\n",
-" \n",
-"# Cleanup the locally downloaded version of SimApp\n",
-"!rm deepracer-simapp.tar.gz\n"
+" # Uploading the Robomaker SimApp to your S3 bucket\n",
+" !aws s3 cp ./deepracer-simapp-notebook.tar.gz s3://{s3_bucket}/{robomaker_s3_key}\n",
+"\n",
+" # Cleanup the locally downloaded version of SimApp\n",
+" !rm deepracer-simapp-notebook.tar.gz\n",
+"else:\n",
+" print(\"Using the simapp from build directory\")\n",
+" !aws s3 cp ./build/output.tar.gz s3://{s3_bucket}/{robomaker_s3_key}"
 ]
 },
 {
@@ -664,10 +736,9 @@
 " \"securityGroups\": deepracer_security_groups,\n",
 " \"assignPublicIp\": True}\n",
 "\n",
-"client_request_token = strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n",
-"\n",
 "responses = []\n",
 "for job_no in range(num_simulation_workers):\n",
+" client_request_token = strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n",
 " response = robomaker.create_simulation_job(iamRole=sagemaker_role,\n",
 " clientRequestToken=client_request_token,\n",
 " maxJobDurationInSeconds=job_duration_in_seconds,\n",
@@ -908,23 +979,6 @@
 "# model_output = \"s3://{}/{}\".format(s3_bucket, s3_bucket)\n",
 "# !aws s3 rm --recursive {model_output}"
 ]
-},
-{
-"cell_type": "markdown",
-"metadata": {},
-"source": [
-"### Clean the docker images\n",
-"Remove this only when you want to completely remove the docker or clean up the space of the sagemaker instance"
-]
-},
-{
-"cell_type": "code",
-"execution_count": null,
-"metadata": {},
-"outputs": [],
-"source": [
-"# !docker rmi -f $(docker images -q)"
-]
 }
 ],
 "metadata": {
