Add a torchrun launch script and a --distributed flag to enable distributed inference

fduwjj · fduwjj · commit 95dd7e474deb · 2024-07-01T10:01:37.000-07:00
diff --git a/build/builder.py b/build/builder.py
@@ -142,7 +142,7 @@ def from_args(cls, args):  # -> BuilderArgs:
             device=args.device,
             precision=dtype,
             setup_caches=(args.output_dso_path or args.output_pte_path),
-            use_distributed=False,
+            use_distributed=args.distributed,
             is_chat_model=is_chat_model,
         )
 
@@ -347,6 +347,7 @@ def _load_model(builder_args, only_config=False):
     else:
         model = _load_model_default(builder_args)
 
+    # TODO: Support for loading the model from a checkpoint is still in progress.
     if builder_args.use_distributed:
         # init distributed
         world_size = int(os.environ["WORLD_SIZE"])
diff --git a/cli.py b/cli.py
@@ -56,6 +56,11 @@ def add_arguments_for_verb(parser, verb: str):
         action="store_true",
         help="Whether to start an interactive chat session",
     )
+    parser.add_argument(
+        "--distributed",
+        action="store_true",
+        help="Whether to enable distributed inference",
+    )
     parser.add_argument(
         "--gui",
         action="store_true",
diff --git a/config/model_config.py b/config/model_config.py
@@ -3,6 +3,7 @@
 
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+
 import json
 from dataclasses import dataclass, field
 from enum import Enum
diff --git a/distributed/run_dist_inference.sh b/distributed/run_dist_inference.sh
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Launch `torchchat.py chat llama3 --distributed` under torchrun.
+# NOTE(review): torchchat.py is resolved relative to the current directory.
+set -ex
+
+# libUV is a scalable backend for TCPStore, which is used in ProcessGroup
+# rendezvous. This is the recommended backend for distributed training.
+export USE_LIBUV=1
+
+# Environment variables act as local overrides for convenience, e.g.
+#   LOG_RANK=0,1 NGPU=4 ./run_dist_inference.sh
+# Extra command-line arguments are forwarded unchanged to torchchat.py.
+
+# Number of worker processes torchrun starts on this node (default: 8).
+NGPU=${NGPU:-"8"}
+
+# TODO: We need to decide how to log for inference.
+# By default, only rank 0's output is logged.
+LOG_RANK=${LOG_RANK:-0}
+
+# Quote every expansion and forward "$@" so that arguments containing spaces
+# or glob characters reach torchchat.py intact instead of being re-split.
+torchrun --nproc_per_node="${NGPU}" --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
+--local-ranks-filter "${LOG_RANK}" --role rank --tee 3 \
+torchchat.py chat llama3 --distributed "$@"