Skip to content

Commit 7ef6b99

Browse files
benieric authored and pintaoz-aws committed
Add bugbash bootstrapping (#1598)
1 parent 758a311 commit 7ef6b99

File tree

7 files changed

+58
-12
lines changed

7 files changed

+58
-12
lines changed

src/sagemaker/modules/testing_notebooks/base_model_trainer.ipynb

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -326,7 +326,7 @@
326326
" channel_name=\"dataset\",\n",
327327
" data_source=training_input_path,\n",
328328
")\n",
329-
"model_trainer.train(input_data_config=[test_data])"
329+
"model_trainer.train(input_data_config=[test_data], wait=False)"
330330
]
331331
},
332332
{
@@ -417,7 +417,7 @@
417417
" channel_name=\"dataset\",\n",
418418
" data_source=training_input_path,\n",
419419
")\n",
420-
"model_trainer.train(input_data_config=[test_data])"
420+
"model_trainer.train(input_data_config=[test_data], wait=False)"
421421
]
422422
},
423423
{
@@ -487,10 +487,16 @@
487487
" ]\n",
488488
")\n",
489489
"\n",
490+
"env = {\n",
491+
" \"NCCL_P2P_DISABLE\": \"1\",\n",
492+
" \"NCCL_DEBUG\": \"WARN\",\n",
493+
"}\n",
494+
"\n",
490495
"model_trainer = ModelTrainer(\n",
491496
" training_image=hugging_face_image,\n",
492497
" compute=compute,\n",
493498
" hyperparameters=hyperparameters,\n",
499+
" environment=env,\n",
494500
" source_code=source_code,\n",
495501
" distributed_runner=mpi,\n",
496502
" base_job_name=f\"{alias}-distributed-case-3\",\n",
@@ -507,7 +513,7 @@
507513
" channel_name=\"dataset\",\n",
508514
" data_source=training_input_path,\n",
509515
")\n",
510-
"model_trainer.train(input_data_config=[test_data])"
516+
"model_trainer.train(input_data_config=[test_data], wait=False)"
511517
]
512518
},
513519
{
@@ -570,7 +576,7 @@
570576
" training_recipe=\"training/llama/hf_llama3_8b_seq8192_gpu\",\n",
571577
" training_image=training_image,\n",
572578
" recipe_overrides=recipe_overrides,\n",
573-
" compute=Compute(instance_type=\"ml.g5.48xlarge\"),\n",
579+
" compute=Compute(instance_type=\"ml.p4d.24xlarge\", keep_alive_period_in_seconds=3600),\n",
574580
" base_job_name=f\"{alias}-recipe-case-1\",\n",
575581
")"
576582
]
@@ -581,7 +587,7 @@
581587
"metadata": {},
582588
"outputs": [],
583589
"source": [
584-
"model_trainer.train()"
590+
"model_trainer.train(wait=False)"
585591
]
586592
},
587593
{
@@ -607,7 +613,7 @@
607613
"model_trainer = ModelTrainer.from_recipe(\n",
608614
" training_recipe=\"recipes/custom-recipe.yaml\",\n",
609615
" training_image=training_image,\n",
610-
" compute=Compute(instance_type=\"ml.g5.48xlarge\"),\n",
616+
" compute=Compute(instance_type=\"ml.p4d.24xlarge\", keep_alive_period_in_seconds=3600),\n",
611617
" base_job_name=f\"{alias}-recipe-case-2\",\n",
612618
")"
613619
]
@@ -618,7 +624,7 @@
618624
"metadata": {},
619625
"outputs": [],
620626
"source": [
621-
"model_trainer.train()"
627+
"model_trainer.train(wait=False)"
622628
]
623629
},
624630
{
@@ -678,11 +684,13 @@
678684
" compute=Compute(\n",
679685
" instance_type=\"ml.trn1.32xlarge\",\n",
680686
" instance_count=2,\n",
687+
" keep_alive_period_in_seconds=3600\n",
681688
" ),\n",
682689
" stopping_condition=StoppingCondition(\n",
683690
" max_runtime_in_seconds=3600\n",
684691
" ),\n",
685-
" environment=env\n",
692+
" environment=env,\n",
693+
" base_job_name=f\"{alias}-recipe-case-3\",\n",
686694
")"
687695
]
688696
},
@@ -699,6 +707,13 @@
699707
"\n",
700708
"model_trainer.train(input_data_config=[train], wait=False)"
701709
]
710+
},
711+
{
712+
"cell_type": "code",
713+
"execution_count": null,
714+
"metadata": {},
715+
"outputs": [],
716+
"source": []
702717
}
703718
],
704719
"metadata": {

src/sagemaker/modules/testing_notebooks/bootstrap.sh renamed to src/sagemaker/modules/testing_notebooks/bootstrap/bootstrap.sh

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#!/bin/bash
22

3+
echo "Installing Docker"
4+
35
sudo apt-get -y install ca-certificates curl gnupg
46

57
sudo install -m 0755 -d /etc/apt/keyrings
@@ -19,5 +21,27 @@ sudo apt-get install docker-ce-cli=$VERSION_STRING docker-compose-plugin -y
1921
# validate the Docker Client is able to access Docker Server at [unix:///docker/proxy.sock]
2022
docker version
2123

24+
echo "Installing Local SageMaker Tarball"
2225
pip install "pydantic>=2.0.0"
23-
pip install sagemaker-2.232.4.dev0.tar.gz
26+
pip install sagemaker-2.232.4.dev0.tar.gz
27+
28+
29+
echo "Setting Up Read-Only SSH Access"
30+
eval "$(ssh-agent -s)"
31+
32+
mkdir -p ~/.ssh/
33+
34+
cp /home/sagemaker-user/bootstrap/adapter_deploy_key /home/sagemaker-user/.ssh/adapter_deploy_key
35+
chmod 600 ~/.ssh/adapter_deploy_key
36+
37+
cp /home/sagemaker-user/bootstrap/launcher_deploy_key /home/sagemaker-user/.ssh/launcher_deploy_key
38+
chmod 600 ~/.ssh/launcher_deploy_key
39+
40+
cp /home/sagemaker-user/bootstrap/config /home/sagemaker-user/.ssh/config
41+
chmod 644 ~/.ssh/config
42+
43+
ssh-add ~/.ssh/adapter_deploy_key
44+
ssh-add ~/.ssh/launcher_deploy_key
45+
46+
47+
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
Host github.com-launcher
2+
Hostname github.com
3+
IdentityFile=~/.ssh/launcher_deploy_key
4+
5+
Host github.com-adapter
6+
Hostname github.com
7+
IdentityFile=~/.ssh/adapter_deploy_key
Binary file not shown.
Binary file not shown.

src/sagemaker/modules/train/container_drivers/mpi_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,6 @@ def get_mpirun_command(
206206
str(num_processes),
207207
"--allow-run-as-root",
208208
"--tag-output",
209-
"--oversubscribe",
210209
"-mca",
211210
"btl_tcp_if_include",
212211
network_interface_name,
@@ -245,6 +244,7 @@ def get_mpirun_command(
245244
instance_type = os.environ["SM_CURRENT_INSTANCE_TYPE"]
246245
# EFA settings
247246
if instance_type in SM_EFA_NCCL_INSTANCES:
247+
mpirun_command.extend(["-x", "FI_PROVIDER=efa"])
248248
# Use simple protocol to handle the out-of-order data delivery from EFA
249249
mpirun_command.extend(["-x", "NCCL_PROTO=simple"])
250250

src/sagemaker/modules/train/sm_recipes/training_recipes.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
2-
"adapter_repo": "git@github.com:aws/private-sagemaker-training-adapter-for-nemo-staging.git",
3-
"launcher_repo": "git@github.com:aws/private-sagemaker-training-launcher-staging.git",
2+
"adapter_repo": "git@github.com-adapter:benieric/private-sagemaker-hyperpod-training-adapter-for-nemo-staging.git",
3+
"launcher_repo": "git@github.com-launcher:benieric/private-sagemaker-hyperpod-recipes-staging.git",
44
"neuron_dist_repo": "https://github.com/aws-neuron/neuronx-distributed-training.git",
55
"gpu_image" : {
66
"framework": "pytorch-smp",

0 commit comments

Comments
 (0)