
Commit bc8f239

[Dist][Inference] Explore checkpoint loading
1 parent b6b6c1e commit bc8f239

File tree

7 files changed: +125 −18 lines

build/builder.py

Lines changed: 32 additions & 17 deletions
@@ -21,7 +21,7 @@
 
 from build.model import Transformer
 from build.utils import device_sync, is_cpu_device, is_cuda_or_cpu_device, name_to_dtype
-from distributed import parallelize_llama, ParallelDims, init_distributed
+from distributed import parallelize_llama, ParallelDims, init_distributed, load_checkpoints_to_model
 
 
 @dataclass
@@ -303,6 +303,7 @@ def _load_model_default(builder_args, only_config=False):
     model = _init_model_on_meta_device(builder_args)
     # checkpoint = torch.load(str(builder_args.checkpoint_path), mmap=True, weights_only=True)
     cps = []
+    print(f"Loading {builder_args.checkpoint_path} dir: {builder_args.checkpoint_dir}")
     if builder_args.checkpoint_dir is not None:
         # Load multiple checkpoint; ignore the single path.
         builder_args.checkpoint_path = None
@@ -343,27 +344,41 @@ def _load_model_default(builder_args, only_config=False):
     return model
 
 
+def _maybe_init_distributed(builder_args):
+    if not builder_args.use_distributed:
+        return None, None
+    # TODO: ongoing work to support loading model from checkpoint
+    # init distributed
+    world_size = int(os.environ["WORLD_SIZE"])
+    # TODO: To make tp, pp degree configurable
+    parallel_dims = ParallelDims(
+        tp=8,
+        pp=1,
+        world_size=world_size,
+    )
+    init_distributed()
+    world_mesh = parallel_dims.build_mesh(device_type="cuda")
+    return world_mesh, parallel_dims
+
+
+def _maybe_parellelize_model(model, builder_args, world_mesh, parallel_dims):
+    if world_mesh is None:
+        return model
+    assert parallel_dims is not None
+    print("Applying model parallel to model ...")
+    parallelize_llama(model, world_mesh, parallel_dims)
+    return load_checkpoints_to_model(model, builder_args, world_mesh)
+
+
 def _load_model(builder_args, only_config=False):
+    world_mesh, parallel_dims = _maybe_init_distributed(builder_args)
     if builder_args.gguf_path:
         model = _load_model_gguf(builder_args)
+    elif builder_args.use_distributed:
+        model = _init_model_on_meta_device(builder_args)
     else:
        model = _load_model_default(builder_args)
-
-    # TODO: ongoing work to support loading model from checkpoint
-    if builder_args.use_distributed:
-        # init distributed
-        world_size = int(os.environ["WORLD_SIZE"])
-        # TODO: To make tp, pp degree configurable
-        parallel_dims = ParallelDims(
-            tp=8,
-            pp=1,
-            world_size=world_size,
-        )
-        init_distributed()
-        world_mesh = parallel_dims.build_mesh(device_type="cuda")
-
-        print("Applying model parallel to model ...")
-        parallelize_llama(model, world_mesh, parallel_dims)
+    model = _maybe_parellelize_model(model, builder_args, world_mesh, parallel_dims)
 
     model = model.to(device=builder_args.device, dtype=builder_args.precision)
     return model.eval()
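In short, this hunk hoists the distributed setup out of _load_model into _maybe_init_distributed, initializes the model on the meta device when --distributed is set, and routes it through _maybe_parellelize_model, which applies tensor parallelism and then calls the new load_checkpoints_to_model. The sketch below approximates the device mesh this builds using only the public torch.distributed API; ParallelDims and init_distributed are the repo's own helpers and their internals are not part of this commit, so treat it as an illustration under the hard-coded tp=8, pp=1 assumption.

# Approximate, stand-alone view of the mesh that _maybe_init_distributed sets up
# (illustration only; the commit itself goes through ParallelDims/init_distributed).
# Run under: torchrun --nproc_per_node=8 demo.py
import os

import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

dist.init_process_group(backend="nccl")
world_size = int(os.environ["WORLD_SIZE"])  # 8 for the hard-coded tp=8, pp=1 case

# One-dimensional "tp" mesh over all ranks; load_checkpoints_to_model later
# slices it as world_mesh["tp"] and asks for each rank's position in it.
world_mesh = init_device_mesh("cuda", (world_size,), mesh_dim_names=("tp",))
print(f"rank {dist.get_rank()}: tp local rank {world_mesh['tp'].get_local_rank()}")
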
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+{"dim": 8192, "ffn_dim_multiplier": 1.3, "multiple_of": 4096, "n_heads": 64, "n_local_heads": 8, "n_layers": 80, "rope_base": 500000.0, "vocab_size": 128256, "use_tiktoken": true}

cli.py

Lines changed: 6 additions & 0 deletions
@@ -143,6 +143,12 @@ def add_arguments_for_verb(parser, verb: str):
         default="not_specified",
         help="Use the specified model checkpoint path",
     )
+    parser.add_argument(
+        "--checkpoint-dir",
+        type=Path,
+        default=None,
+        help="Use the specified model checkpoint directory",
+    )
     parser.add_argument(
         "--params-path",
         type=Path,
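The new --checkpoint-dir flag complements the existing --checkpoint-path option but points at a directory of per-rank checkpoint files: _load_model_default nulls out checkpoint_path when it is set, and distributed/checkpoint.py reads consolidated.0{rank}.pth files from it. A minimal sketch of the flag in isolation (the real parser lives in add_arguments_for_verb with many more options):

# Minimal sketch of the new flag on its own; illustration only.
import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument(
    "--checkpoint-dir",
    type=Path,
    default=None,
    help="Use the specified model checkpoint directory",
)

args = parser.parse_args(
    ["--checkpoint-dir",
     "~/.torchchat/model-cache/meta-llama/Meta-Llama-3-70B-Instruct/original"]
)
print(args.checkpoint_dir)  # a pathlib.Path; downstream code only checks `is not None`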

config/data/models.json

Lines changed: 6 additions & 0 deletions
@@ -34,6 +34,12 @@
     "distribution_path": "meta-llama/Meta-Llama-3-8B-Instruct",
     "transformer_params_key": "Meta-Llama-3-8B"
   },
+  "meta-llama/Meta-Llama-3-70B-Instruct": {
+    "aliases": ["llama3-70b"],
+    "distribution_channel": "HuggingFaceSnapshot",
+    "distribution_path": "meta-llama/Meta-Llama-3-70B-Instruct",
+    "transformer_params_key": "Meta-Llama-3-70B"
+  },
   "meta-llama/CodeLlama-7b-Python-hf": {
     "aliases": ["codellama", "codellama-7b"],
     "distribution_channel": "HuggingFaceSnapshot",

distributed/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -7,3 +7,4 @@
 from distributed.parallelize_llama import parallelize_llama
 from distributed.parallel_config import ParallelDims
 from distributed.utils import init_distributed
+from distributed.checkpoint import load_checkpoints_to_model

distributed/checkpoint.py

Lines changed: 78 additions & 0 deletions
@@ -0,0 +1,78 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+
+import torch
+import torch.distributed.checkpoint as dist_cp
+from torch.distributed._tensor import DTensor, Replicate, Shard
+
+STATE_DICT_SHARDING_DIM_MAP = {
+    "tok_embeddings.weight": 0,
+    "attention.wq.weight" : 0,
+    "attention.wk.weight" : 0,
+    "attention.wv.weight" : 0,
+    "attention.wo.weight" : 1,
+    "feed_forward.w1.weight" : 0,
+    "feed_forward.w2.weight" : 1,
+    "feed_forward.w3.weight" : 0,
+
+    "attention_norm.weight" : -1,
+    "ffn_norm.weight": -1,
+    "norm.weight" : -1,
+    "output.weight":0,
+}
+
+
+def _get_maybe_shard_for_weight(fqn_key):
+    for pattern, value in STATE_DICT_SHARDING_DIM_MAP.items():
+        if fqn_key.endswith(pattern):
+            return value
+    return -1
+
+
+def _build_distributed_state_dict(state_dict, tp_mesh):
+    dist_state_dict = {}
+    for k, v in state_dict.items():
+        shard = _get_maybe_shard_for_weight(k)
+        if shard > 0:
+            dist_state_dict[k] = DTensor.from_local(v, tp_mesh, [Shard(shard)], run_check=False)
+        else:
+            dist_state_dict[k] = DTensor.from_local(v, tp_mesh, [Replicate()], run_check=False)
+    return dist_state_dict
+
+
+def _load_checkpoints_from_storage(builder_args, local_rank):
+    assert builder_args.checkpoint_dir is not None, "One needs to specify --checkpoint-path to load from storage"
+    #NOTE: We made a couple assumptions here:
+    cp_name = "model.pth" if local_rank == 0 else f"consolidated.0{local_rank}.pth"
+    checkpoint_path = str(builder_args.checkpoint_path) if local_rank == 0 else os.path.join(builder_args.checkpoint_dir, cp_name)
+    print(f"Loading {cp_name} on rank {local_rank}")
+    return torch.load(
+        checkpoint_path,
+        map_location=builder_args.device,
+        mmap=True,
+    )
+
+
+def load_checkpoints_to_model(model, builder_args, world_mesh):
+    tp_mesh = world_mesh["tp"]
+    local_rank = tp_mesh.get_local_rank()
+    state_dict_storage = _load_checkpoints_from_storage(builder_args, local_rank)
+    dist_state_dict = _build_distributed_state_dict(state_dict_storage, tp_mesh)
+    CHECKPOINT_DIR="converted_checkpoints"
+    dist_cp.save(
+        state_dict=dist_state_dict,
+        storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
+    )
+
+    model_state_dict = model.state_dict()
+    dist_cp.load(
+        state_dict=model_state_dict,
+        storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
+    )
+    model.load_state_dict(model_state_dict, assign=True)
+    return model
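The flow in this new module: each rank loads a plain .pth from disk (rank 0 the single file behind --checkpoint-path, other ranks consolidated.0{rank}.pth from --checkpoint-dir), wraps every weight as a DTensor with the per-weight Shard/Replicate placement from STATE_DICT_SHARDING_DIM_MAP, writes the result once with torch.distributed.checkpoint, and then loads that saved checkpoint back into the already-parallelized model's state dict, letting DCP reshard the weights to wherever the tensor-parallel model expects them. The toy example below exercises the same save-then-reshard-on-load mechanism on a single tensor; it is illustrative only, independent of torchchat, and meant to be run under torchrun.

# Toy round trip of the mechanism load_checkpoints_to_model relies on: wrap a
# local tensor as a DTensor with explicit placements, save it with
# torch.distributed.checkpoint, then load it back under different placements
# and let DCP reshard. Sketch only; run with: torchrun --nproc_per_node=2 demo.py
import torch
import torch.distributed as dist
import torch.distributed.checkpoint as dist_cp
from torch.distributed._tensor import DTensor, Replicate, Shard
from torch.distributed.device_mesh import init_device_mesh

dist.init_process_group(backend="gloo")  # gloo so the sketch also works on CPU
mesh = init_device_mesh("cpu", (dist.get_world_size(),), mesh_dim_names=("tp",))

# Each rank contributes a [4, 8] local shard of a weight sharded on dim 0.
local = torch.randn(4, 8)
sharded = {"w": DTensor.from_local(local, mesh, [Shard(0)], run_check=False)}
dist_cp.save(state_dict=sharded, storage_writer=dist_cp.FileSystemWriter("ckpt_demo"))

# Read the same weight back replicated; DCP reshards during load.
full = torch.empty(4 * dist.get_world_size(), 8)
replicated = {"w": DTensor.from_local(full, mesh, [Replicate()], run_check=False)}
dist_cp.load(state_dict=replicated, storage_reader=dist_cp.FileSystemReader("ckpt_demo"))
print("reassembled weight:", replicated["w"].to_local().shape)  # [8, 8] with 2 ranks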

distributed/run_dist_inference.sh

Lines changed: 1 addition & 1 deletion
@@ -28,4 +28,4 @@ fi
 
 torchrun --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
 --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
-torchchat.py chat llama3 --distributed $overrides
+torchchat.py chat llama3-70b --distributed $overrides --checkpoint-dir ~/.torchchat/model-cache/meta-llama/Meta-Llama-3-70B-Instruct/original
