# Future
from __future__ import absolute_import, division, print_function, unicode_literals

# Standard Library
import os
import subprocess
import sys
from argparse import REMAINDER, ArgumentParser


r"""
`torch.distributed.launch` is a module that spawns multiple distributed
training processes on each of the training nodes.

The utility can be used for single-node distributed training, in which one or
more processes per node will be spawned. The utility can be used for either
CPU training or GPU training. If the utility is used for GPU training,
each distributed process will operate on a single GPU, which can significantly
improve single-node training performance. It can also be used in
multi-node distributed training, where spawning multiple processes on each
node likewise improves multi-node training performance.
This is especially beneficial for systems with multiple InfiniBand
interfaces that have direct-GPU support, since all of them can be utilized for
aggregated communication bandwidth.

In both single-node and multi-node distributed training, this utility will
launch the given number of processes per node (``--nproc_per_node``). If used
for GPU training, this number needs to be less than or equal to the number of
GPUs on the current system, and each process will operate on a single GPU,
from *GPU 0 to GPU (nproc_per_node - 1)*.

**How to use this module:**

1. Single-Node multi-process distributed training

::

    >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
               YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other
               arguments of your training script)

2. Multi-Node multi-process distributed training: (e.g. two nodes)


Node 1: *(IP: 192.168.1.1, and has a free port: 1234)*

::

    >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
               --nnodes=2 --node_rank=0 --master_addr="192.168.1.1"
               --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
               and all other arguments of your training script)

Node 2:

::

    >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE
               --nnodes=2 --node_rank=1 --master_addr="192.168.1.1"
               --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3
               and all other arguments of your training script)

3. To look up what optional arguments this module offers:

::

    >>> python -m torch.distributed.launch --help


**Important Notices:**

1. This utility and multi-process distributed (single-node or
multi-node) GPU training currently only achieves the best performance using
the NCCL distributed backend. Thus, the NCCL backend is the recommended
backend to use for GPU training.

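For example, GPU training would typically initialize the process group as in
the sketch below (see notice 3 for why ``init_method`` must be ``env://`` with
this launcher):

::

    torch.distributed.init_process_group(backend='nccl',
                                         init_method='env://')
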
2. In your training program, you must parse the command-line argument
``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by this module.
If your training program uses GPUs, you should ensure that your code only
runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by:

Parsing the local_rank argument

::

    >>> import argparse
    >>> parser = argparse.ArgumentParser()
    >>> parser.add_argument("--local_rank", type=int)
    >>> args = parser.parse_args()

Set your device to the local rank using either

::

    >>> torch.cuda.set_device(args.local_rank)  # before your code runs

or

::

    >>> with torch.cuda.device(args.local_rank):
    >>>     # your code to run

3. In your training program, you are supposed to call the following function
at the beginning to start the distributed backend. Make sure that
``init_method`` uses ``env://``, which is the only ``init_method`` supported
by this module: the launcher sets ``MASTER_ADDR``, ``MASTER_PORT``, ``RANK``,
and ``WORLD_SIZE`` in each process's environment, which is exactly what the
``env://`` method reads.

::

    torch.distributed.init_process_group(backend='YOUR BACKEND',
                                         init_method='env://')

4. In your training program, you can either use regular distributed functions
or use :class:`torch.nn.parallel.DistributedDataParallel`. If your
training program uses GPUs for training and you would like to use
:class:`torch.nn.parallel.DistributedDataParallel`,
here is how to configure it.

::

    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[args.local_rank],
                                                      output_device=args.local_rank)

Please ensure that the ``device_ids`` argument is set to the only GPU device id
that your code will be operating on. This is generally the local rank of the
process. In other words, ``device_ids`` needs to be ``[args.local_rank]`` and
``output_device`` needs to be ``args.local_rank`` in order to use this utility.

5. Another way to pass ``local_rank`` to the subprocesses is via the
environment variable ``LOCAL_RANK``. This behavior is enabled when you launch
the script with ``--use_env``. You must then adjust the subprocess example
above to replace ``args.local_rank`` with ``os.environ['LOCAL_RANK']``; the
launcher will not pass ``--local_rank`` when you specify this flag. See the
sketch below.

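For example, a minimal sketch of what your training script would do in that
case (illustrative code for your script, not part of this launcher):

::

    >>> import os
    >>> local_rank = int(os.environ["LOCAL_RANK"])  # set by this launcher
    >>> torch.cuda.set_device(local_rank)
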
.. warning::

    ``local_rank`` is NOT globally unique: it is only unique among the
    processes running on a single machine. Thus, don't use it to decide
    whether you should, e.g., write to a networked filesystem. See
    https://github.com/pytorch/pytorch/issues/12042 for an example of
    how things can go wrong if you don't handle this correctly.

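As a hedged illustration of that advice (assuming a ``model`` as in notice 4
above), writes that must happen exactly once per job can instead be gated on
the *global* rank:

::

    >>> import torch.distributed as dist
    >>> if dist.get_rank() == 0:
    >>>     # only the globally-unique rank 0 process touches shared storage
    >>>     torch.save(model.state_dict(), "checkpoint.pt")
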
"""


def parse_args():
    """
    Helper function parsing the command line options
    @retval ArgumentParser
    """
    parser = ArgumentParser(
        description="PyTorch distributed training launch "
        "helper utility that will spawn up "
        "multiple distributed processes"
    )

    # Optional arguments for the launch helper
    parser.add_argument(
        "--nnodes",
        type=int,
        default=1,
        help="The number of nodes to use for distributed " "training",
    )
    parser.add_argument(
        "--node_rank",
        type=int,
        default=0,
        help="The rank of the node for multi-node distributed " "training",
    )
    parser.add_argument(
        "--nproc_per_node",
        type=int,
        default=1,
        help="The number of processes to launch on each node, "
        "for GPU training, this is recommended to be set "
        "to the number of GPUs in your system so that "
        "each process can be bound to a single GPU.",
    )
    parser.add_argument(
        "--master_addr",
        default="127.0.0.1",
        type=str,
        help="Master node (rank 0)'s address, should be either "
        "the IP address or the hostname of node 0, for "
        "single node multi-proc training, the "
        "--master_addr can simply be 127.0.0.1",
    )
    parser.add_argument(
        "--master_port",
        default=29500,
        type=int,
        help="Master node (rank 0)'s free port that needs to "
        "be used for communication during distributed "
        "training",
    )
    parser.add_argument(
        "--use_env",
        default=False,
        action="store_true",
        help="Use environment variable to pass "
        "'local rank'. For legacy reasons, the default value is False. "
        "If set to True, the script will not pass "
        "--local_rank as argument, and will instead set LOCAL_RANK.",
    )
    parser.add_argument(
        "-m",
        "--module",
        default=False,
        action="store_true",
        help="Changes each process to interpret the launch script "
        "as a python module, executing with the same behavior as "
        "'python -m'.",
    )
    parser.add_argument(
        "--no_python",
        default=False,
        action="store_true",
        help='Do not prepend the training script with "python" - just exec '
        "it directly. Useful when the script is not a Python script.",
    )

    # positional
    parser.add_argument(
        "training_script",
        type=str,
        help="The full path to the single GPU training "
        "program/script to be launched in parallel, "
        "followed by all the arguments for the "
        "training script",
    )

    # rest from the training program
    parser.add_argument("training_script_args", nargs=REMAINDER)
    return parser.parse_args()


def main():
    args = parse_args()

    # world size in terms of number of processes
    dist_world_size = args.nproc_per_node * args.nnodes

    # set PyTorch distributed related environment variables
    current_env = os.environ.copy()
    current_env["MASTER_ADDR"] = args.master_addr
    current_env["MASTER_PORT"] = str(args.master_port)
    current_env["WORLD_SIZE"] = str(dist_world_size)

    processes = []

    if "OMP_NUM_THREADS" not in os.environ and args.nproc_per_node > 1:
        current_env["OMP_NUM_THREADS"] = str(1)
        print(
            "*****************************************\n"
            "Setting OMP_NUM_THREADS environment variable for each process "
            "to be {} by default, to avoid overloading your system. "
            "Please further tune the variable for optimal performance in "
            "your application as needed.\n"
            "*****************************************".format(current_env["OMP_NUM_THREADS"])
        )

    for local_rank in range(0, args.nproc_per_node):
        # each process's rank
        dist_rank = args.nproc_per_node * args.node_rank + local_rank
        current_env["RANK"] = str(dist_rank)
        current_env["LOCAL_RANK"] = str(local_rank)

        # spawn the processes
        with_python = not args.no_python
        cmd = []
        if with_python:
            cmd = [sys.executable, "-u"]
            if args.module:
                cmd.append("-m")
        else:
            if not args.use_env:
                raise ValueError(
                    "When using the '--no_python' flag, you must also set the '--use_env' flag."
                )
            if args.module:
                raise ValueError(
                    "Don't use both the '--no_python' flag and the '--module' flag at the same time."
                )

        cmd.append(args.training_script)

        if not args.use_env:
            cmd.append("--local_rank={}".format(local_rank))

        cmd.extend(args.training_script_args)

        process = subprocess.Popen(cmd, env=current_env)
        processes.append(process)

    for process in processes:
        process.wait()
        if process.returncode != 0:
            raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)


if __name__ == "__main__":
    main()