# Future
from __future__ import absolute_import, division, print_function, unicode_literals

# Standard Library
import os
import subprocess
import sys
from argparse import REMAINDER, ArgumentParser


r"""
`torch.distributed.launch` is a module that spawns multiple distributed
training processes on each of the training nodes.

The utility can be used for single-node distributed training, in which one or
more processes per node will be spawned. The utility can be used for either
CPU training or GPU training. If the utility is used for GPU training,
each distributed process will operate on a single GPU, which can significantly
improve single-node training performance. It can also be used in
multi-node distributed training, where spawning multiple processes on each
node likewise improves multi-node training performance.
This is especially beneficial for systems with multiple InfiniBand
interfaces that have direct-GPU support, since all of them can be utilized for
aggregated communication bandwidth.

In both single-node and multi-node distributed training, this utility will
launch the given number of processes per node (``--nproc_per_node``). If used
for GPU training, this number needs to be less than or equal to the number of
GPUs on the current system, and each process will operate on a single GPU,
from *GPU 0 to GPU (nproc_per_node - 1)*.

**How to use this module:**

1. Single-Node multi-process distributed training

::

    >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
               YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
               arguments of your training script)

2. Multi-Node multi-process distributed training: (e.g. two nodes)


Node 1: *(IP: 192.168.1.1, and has a free port: 1234)*

::

    >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
               --nnodes=2 --node_rank=0 --master_addr="192.168.1.1"
               --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
               and all other arguments of your training script)

Node 2:

::

    >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
               --nnodes=2 --node_rank=1 --master_addr="192.168.1.1"
               --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
               and all other arguments of your training script)

3. To look up what optional arguments this module offers:

::

    >>> python -m torch.distributed.launch --help


**Important Notices:**

1. This utility and multi-process distributed (single-node or
multi-node) GPU training currently only achieves the best performance using
the NCCL distributed backend. Thus, the NCCL backend is the recommended
backend to use for GPU training.

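For example, GPU training would typically initialize the process group as in
the sketch below (see notice 3 for why ``init_method`` must be ``env://`` with
this launcher):

::

    torch.distributed.init_process_group(backend='nccl',
                                         init_method='env://')
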
2. In your training program, you must parse the command-line argument
``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by this module.
If your training program uses GPUs, you should ensure that your code only
runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by:

Parsing the local_rank argument

::

    >>> import argparse
    >>> parser = argparse.ArgumentParser()
    >>> parser.add_argument("--local_rank", type=int)
    >>> args = parser.parse_args()

Set your device to the local rank using either

::

    >>> torch.cuda.set_device(args.local_rank)  # before your code runs

or

::

    >>> with torch.cuda.device(args.local_rank):
    >>>     # your code to run

3. In your training program, you are supposed to call the following function
at the beginning to start the distributed backend. Make sure that
``init_method`` uses ``env://``, which is the only ``init_method`` supported
by this module: the launcher sets ``MASTER_ADDR``, ``MASTER_PORT``, ``RANK``,
and ``WORLD_SIZE`` in each process's environment, which is exactly what the
``env://`` method reads.

::

    torch.distributed.init_process_group(backend='YOUR BACKEND',
                                         init_method='env://')

4. In your training program, you can either use regular distributed functions
or use :class:`torch.nn.parallel.DistributedDataParallel`. If your
training program uses GPUs for training and you would like to use
:class:`torch.nn.parallel.DistributedDataParallel`,
here is how to configure it.

::

    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[args.local_rank],
                                                      output_device=args.local_rank)

Please ensure that the ``device_ids`` argument is set to the only GPU device id
that your code will be operating on. This is generally the local rank of the
process. In other words, ``device_ids`` needs to be ``[args.local_rank]`` and
``output_device`` needs to be ``args.local_rank`` in order to use this utility.

5. Another way to pass ``local_rank`` to the subprocesses is via the
environment variable ``LOCAL_RANK``. This behavior is enabled when you launch
the script with ``--use_env``. You must then adjust the subprocess example
above to replace ``args.local_rank`` with ``os.environ['LOCAL_RANK']``; the
launcher will not pass ``--local_rank`` when you specify this flag. See the
sketch below.

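For example, a minimal sketch of what your training script would do in that
case (illustrative code for your script, not part of this launcher):

::

    >>> import os
    >>> local_rank = int(os.environ["LOCAL_RANK"])  # set by this launcher
    >>> torch.cuda.set_device(local_rank)
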
.. warning::

    ``local_rank`` is NOT globally unique: it is only unique among the
    processes running on a single machine. Thus, don't use it to decide
    whether you should, e.g., write to a networked filesystem. See
    https://github.com/pytorch/pytorch/issues/12042 for an example of
    how things can go wrong if you don't handle this correctly.

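As a hedged illustration of that advice (assuming a ``model`` as in notice 4
above), writes that must happen exactly once per job can instead be gated on
the *global* rank:

::

    >>> import torch.distributed as dist
    >>> if dist.get_rank() == 0:
    >>>     # only the globally-unique rank 0 process touches shared storage
    >>>     torch.save(model.state_dict(), "checkpoint.pt")
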
"""


def parse_args():
    """
    Helper function parsing the command line options
    @retval ArgumentParser
    """
    parser = ArgumentParser(
        description="PyTorch distributed training launch "
        "helper utility that will spawn up "
        "multiple distributed processes"
    )

    # Optional arguments for the launch helper
    parser.add_argument(
        "--nnodes",
        type=int,
        default=1,
        help="The number of nodes to use for distributed " "training",
    )
    parser.add_argument(
        "--node_rank",
        type=int,
        default=0,
        help="The rank of the node for multi-node distributed " "training",
    )
    parser.add_argument(
        "--nproc_per_node",
        type=int,
        default=1,
        help="The number of processes to launch on each node, "
        "for GPU training, this is recommended to be set "
        "to the number of GPUs in your system so that "
        "each process can be bound to a single GPU.",
    )
    parser.add_argument(
        "--master_addr",
        default="127.0.0.1",
        type=str,
        help="Master node (rank 0)'s address, should be either "
        "the IP address or the hostname of node 0, for "
        "single node multi-proc training, the "
        "--master_addr can simply be 127.0.0.1",
    )
    parser.add_argument(
        "--master_port",
        default=29500,
        type=int,
        help="Master node (rank 0)'s free port that needs to "
        "be used for communication during distributed "
        "training",
    )
    parser.add_argument(
        "--use_env",
        default=False,
        action="store_true",
        help="Use environment variable to pass "
        "'local rank'. For legacy reasons, the default value is False. "
        "If set to True, the script will not pass "
        "--local_rank as argument, and will instead set LOCAL_RANK.",
    )
    parser.add_argument(
        "-m",
        "--module",
        default=False,
        action="store_true",
        help="Changes each process to interpret the launch script "
        "as a python module, executing with the same behavior as "
        "'python -m'.",
    )
    parser.add_argument(
        "--no_python",
        default=False,
        action="store_true",
        help='Do not prepend the training script with "python" - just exec '
        "it directly. Useful when the script is not a Python script.",
    )

    # positional
    parser.add_argument(
        "training_script",
        type=str,
        help="The full path to the single GPU training "
        "program/script to be launched in parallel, "
        "followed by all the arguments for the "
        "training script",
    )

    # rest from the training program
    parser.add_argument("training_script_args", nargs=REMAINDER)
    return parser.parse_args()


def main():
    args = parse_args()

    # world size in terms of number of processes
    dist_world_size = args.nproc_per_node * args.nnodes

    # set PyTorch distributed related environment variables
    current_env = os.environ.copy()
    current_env["MASTER_ADDR"] = args.master_addr
    current_env["MASTER_PORT"] = str(args.master_port)
    current_env["WORLD_SIZE"] = str(dist_world_size)

    processes = []

    if "OMP_NUM_THREADS" not in os.environ and args.nproc_per_node > 1:
        current_env["OMP_NUM_THREADS"] = str(1)
        print(
            "*****************************************\n"
            "Setting OMP_NUM_THREADS environment variable for each process "
            "to be {} by default, to avoid overloading your system. "
            "Please further tune the variable for optimal performance in "
            "your application as needed.\n"
            "*****************************************".format(current_env["OMP_NUM_THREADS"])
        )

    for local_rank in range(0, args.nproc_per_node):
        # each process's rank
        dist_rank = args.nproc_per_node * args.node_rank + local_rank
        current_env["RANK"] = str(dist_rank)
        current_env["LOCAL_RANK"] = str(local_rank)

        # spawn the processes
        with_python = not args.no_python
        cmd = []
        if with_python:
            cmd = [sys.executable, "-u"]
            if args.module:
                cmd.append("-m")
        else:
            if not args.use_env:
                raise ValueError(
                    "When using the '--no_python' flag, you must also set the '--use_env' flag."
                )
            if args.module:
                raise ValueError(
                    "Don't use both the '--no_python' flag and the '--module' flag at the same time."
                )

        cmd.append(args.training_script)

        if not args.use_env:
            cmd.append("--local_rank={}".format(local_rank))

        cmd.extend(args.training_script_args)

        process = subprocess.Popen(cmd, env=current_env)
        processes.append(process)

    for process in processes:
        process.wait()
        if process.returncode != 0:
            raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)


if __name__ == "__main__":
    main()