Commit ceb9a3a

[Dist][Inference] Enable distributed checkpoint loading for large model (#883)
* [Dist][Inference] Explore checkpoint loading
1 parent 9a94b56 commit ceb9a3a

File tree

7 files changed: +232 additions, -18 deletions

build/builder.py

Lines changed: 77 additions & 17 deletions
@@ -9,9 +9,11 @@
 import time
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Optional, Tuple, Union

 import torch
+import torch.nn as nn
+from torch.distributed.device_mesh import DeviceMesh
 import torch._dynamo.config
 import torch._inductor.config

@@ -22,12 +24,14 @@

 from build.model import Transformer
 from build.utils import device_sync, is_cpu_device, is_cuda_or_cpu_device, name_to_dtype
+from distributed import parallelize_llama, ParallelDims, init_distributed, load_checkpoints_to_model


 @dataclass
 class BuilderArgs:
     checkpoint_path: Optional[Union[Path, str]] = None
     checkpoint_dir: Optional[Union[Path, str]] = None
+    dcp_dir: Optional[Union[Path, str]] = None
     params_path: Optional[Union[Path, str]] = None
     params_table: Optional[str] = None
     gguf_path: Optional[Union[Path, str]] = None
@@ -80,6 +84,8 @@ def from_args(cls, args): # -> BuilderArgs:
         checkpoint_dir = None
         if hasattr(args, "checkpoint_dir"):
            checkpoint_dir = args.checkpoint_dir
+        if hasattr(args, "dcp_dir"):
+            dcp_dir = args.dcp_dir

         checkpoint_path = args.checkpoint_path
         params_table = args.params_table
@@ -133,6 +139,7 @@ def from_args(cls, args): # -> BuilderArgs:
        return cls(
            checkpoint_dir=checkpoint_dir,
            checkpoint_path=checkpoint_path,
+           dcp_dir=dcp_dir,
            params_path=args.params_path,
            params_table=params_table,
            gguf_path=args.gguf_path,
@@ -344,27 +351,80 @@ def _load_model_default(builder_args, only_config=False):
     return model


+def _maybe_init_distributed(
+    builder_args: BuilderArgs,
+) -> Tuple[Optional[DeviceMesh], Optional[ParallelDims]]:
+    """
+    Initialize distributed-related setup if the user specified
+    using distributed inference. If not, this is a no-op.
+
+    Args:
+        builder_args (:class:`BuilderArgs`):
+            Command args for model building.
+    Returns:
+        Tuple[Optional[DeviceMesh], Optional[ParallelDims]]:
+            - The first element is an optional DeviceMesh object,
+              which describes the mesh topology of devices for the DTensor.
+            - The second element is an optional ParallelDims object,
+              which represents the parallel dimensions configuration.
+    """
+    if not builder_args.use_distributed:
+        return None, None
+    # TODO: ongoing work to support loading model from checkpoint
+    # init distributed
+    world_size = int(os.environ["WORLD_SIZE"])
+    # TODO: To make tp, pp degree configurable
+    parallel_dims = ParallelDims(
+        tp=8,
+        pp=1,
+        world_size=world_size,
+    )
+    init_distributed()
+    world_mesh = parallel_dims.build_mesh(device_type="cuda")
+    return world_mesh, parallel_dims
+
+
+def _maybe_parellelize_model(
+    model: nn.Module,
+    builder_args: BuilderArgs,
+    world_mesh: DeviceMesh,
+    parallel_dims: ParallelDims,
+) -> nn.Module:
+    """
+    Parallelize the model and load the distributed checkpoint into it
+    if the user specifies using distributed inference. If not, this is a no-op.
+
+    Args:
+        model (:class:`nn.Module`):
+            Module to be parallelized.
+        builder_args (:class:`BuilderArgs`):
+            Command args for model building.
+        world_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology
+            of devices for the DTensor.
+        parallel_dims (:class:`ParallelDims`):
+            Object which represents the parallel dimensions configuration.
+    Returns:
+        A :class:`nn.Module` object which is parallelized and has its checkpoint
+        loaded if the user specifies using distributed inference.
+    """
+    if world_mesh is None:
+        return model
+    assert parallel_dims is not None
+    print("Applying model parallel to model ...")
+    parallelize_llama(model, world_mesh, parallel_dims)
+    return load_checkpoints_to_model(model, builder_args, world_mesh)
+
+
 def _load_model(builder_args, only_config=False):
+    world_mesh, parallel_dims = _maybe_init_distributed(builder_args)
     if builder_args.gguf_path:
         model = _load_model_gguf(builder_args)
+    elif builder_args.use_distributed:
+        model = _init_model_on_meta_device(builder_args)
     else:
         model = _load_model_default(builder_args)
-
-    # TODO: ongoing work to support loading model from checkpoint
-    if builder_args.use_distributed:
-        # init distributed
-        world_size = int(os.environ["WORLD_SIZE"])
-        # TODO: To make tp, pp degree configurable
-        parallel_dims = ParallelDims(
-            tp=8,
-            pp=1,
-            world_size=world_size,
-        )
-        init_distributed()
-        world_mesh = parallel_dims.build_mesh(device_type="cuda")
-
-        print("Applying model parallel to model ...")
-        parallelize_llama(model, world_mesh, parallel_dims)
+    model = _maybe_parellelize_model(model, builder_args, world_mesh, parallel_dims)

     model = model.to(device=builder_args.device, dtype=builder_args.precision)
     return model.eval()
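
For orientation, here is a minimal sketch (not part of this commit) of the setup that _maybe_init_distributed performs when distributed inference is requested. It assumes the process was launched with torchrun (which exports WORLD_SIZE) and that ParallelDims, init_distributed, and build_mesh behave as shown in the diff above; the commit hard-codes tp=8, while this sketch derives the degree from the world size.

import os

from distributed import ParallelDims, init_distributed

def init_tp_mesh():
    # torchrun exports WORLD_SIZE for every rank.
    world_size = int(os.environ["WORLD_SIZE"])
    # The commit fixes tp=8, pp=1; deriving tp from world_size is an illustrative assumption.
    parallel_dims = ParallelDims(tp=world_size, pp=1, world_size=world_size)
    init_distributed()  # sets up the default process group
    world_mesh = parallel_dims.build_mesh(device_type="cuda")
    # Checkpoint loading later indexes the "tp" sub-mesh via world_mesh["tp"].
    return world_mesh, parallel_dims
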
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+{"dim": 8192, "ffn_dim_multiplier": 1.3, "multiple_of": 4096, "n_heads": 64, "n_local_heads": 8, "n_layers": 80, "rope_base": 500000.0, "vocab_size": 128256, "use_tiktoken": true}

cli.py

Lines changed: 6 additions & 0 deletions
@@ -109,6 +109,12 @@ def add_arguments_for_verb(parser, verb: str) -> None:
         default="not_specified",
         help="Use the specified model checkpoint path",
     )
+    parser.add_argument(
+        "--dcp-dir",
+        type=Path,
+        default=None,
+        help="Use the specified model checkpoint directory",
+    )
     parser.add_argument(
         "--params-path",
         type=Path,
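
As a quick check of the new flag, a self-contained argparse sketch (not torchchat's actual parser) shows that --dcp-dir surfaces as args.dcp_dir, which is the attribute BuilderArgs.from_args probes with hasattr:

import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument(
    "--dcp-dir",
    type=Path,
    default=None,
    help="Use the specified model checkpoint directory",
)
# argparse maps the dashed flag to the attribute name dcp_dir.
args = parser.parse_args(["--dcp-dir", "/tmp/llama3-70b/original"])
print(args.dcp_dir)  # /tmp/llama3-70b/original
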

config/data/models.json

Lines changed: 6 additions & 0 deletions
@@ -34,6 +34,12 @@
         "distribution_path": "meta-llama/Meta-Llama-3-8B-Instruct",
         "transformer_params_key": "Meta-Llama-3-8B"
     },
+    "meta-llama/Meta-Llama-3-70B-Instruct": {
+        "aliases": ["llama3-70b"],
+        "distribution_channel": "HuggingFaceSnapshot",
+        "distribution_path": "meta-llama/Meta-Llama-3-70B-Instruct",
+        "transformer_params_key": "Meta-Llama-3-70B"
+    },
     "meta-llama/CodeLlama-7b-Python-hf": {
         "aliases": ["codellama", "codellama-7b"],
         "distribution_channel": "HuggingFaceSnapshot",

distributed/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -7,3 +7,4 @@
 from distributed.parallelize_llama import parallelize_llama
 from distributed.parallel_config import ParallelDims
 from distributed.utils import init_distributed
+from distributed.checkpoint import load_checkpoints_to_model

distributed/checkpoint.py

Lines changed: 140 additions & 0 deletions
@@ -0,0 +1,140 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import os
+from typing import Any, Mapping
+
+import torch
+import torch.nn as nn
+import torch.distributed.checkpoint as dist_cp
+from torch.distributed._tensor import DTensor, Replicate, Shard
+from torch.distributed.device_mesh import DeviceMesh
+
+STATE_DICT_SHARDING_DIM_MAP = {
+    "tok_embeddings.weight": 0,
+    "attention.wq.weight": 0,
+    "attention.wk.weight": 0,
+    "attention.wv.weight": 0,
+    "attention.wo.weight": 1,
+    "feed_forward.w1.weight": 0,
+    "feed_forward.w2.weight": 1,
+    "feed_forward.w3.weight": 0,
+    "output.weight": 0,
+}
+
+
+def _look_up_maybe_shard_for_weight(fqn: str) -> int:
+    """
+    Look up the sharding dim for the given fqn. If not found, return -1.
+
+    Args:
+        fqn (str): Fully qualified name of the parameter.
+    Returns:
+        int: sharding dim of the parameter.
+    """
+    for pattern, value in STATE_DICT_SHARDING_DIM_MAP.items():
+        if fqn.endswith(pattern):
+            return value
+    return -1
+
+
+def _build_distributed_state_dict(
+    state_dict: Mapping[str, Any],
+    tp_mesh: DeviceMesh,
+) -> Mapping[str, DTensor]:
+    """
+    Convert the original LLaMa checkpoint from local disk to a DTensor-based
+    distributed state dict so that we can leverage distributed
+    checkpoint (DCP) for state_dict resharding and materialization.
+
+    Args:
+        state_dict (dict):
+            A state_dict loaded from local disk.
+        tp_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh sub-topology
+            of devices for Tensor Parallelism.
+    Returns:
+        A state_dict with all values converted to DTensor.
+    """
+    dist_state_dict = {}
+    for k, v in state_dict.items():
+        shard = _look_up_maybe_shard_for_weight(k)
+        if shard > 0:
+            dist_state_dict[k] = DTensor.from_local(v, tp_mesh, [Shard(shard)], run_check=False)
+        else:
+            dist_state_dict[k] = DTensor.from_local(v, tp_mesh, [Replicate()], run_check=False)
+    return dist_state_dict
+
+
+def _load_checkpoints_from_storage(
+    builder_args,  # TODO: Need to remove the circular dependency before specifying the type.
+    local_rank: int,
+) -> Mapping[str, Any]:
+    """
+    Load the original LLaMa checkpoint from local disk.
+
+    Args:
+        builder_args (:class:`BuilderArgs`):
+            Command args for model building.
+        local_rank (int):
+            Local rank within the Tensor Parallel mesh.
+    Returns:
+        A state_dict loaded from local disk.
+    """
+    assert builder_args.dcp_dir is not None, "One needs to specify --dcp-dir to load from storage"
+    # NOTE: We make a couple of assumptions here:
+    # download.py in TorchChat renames `consolidated.00.pth` to `model.pth`, which is
+    # why we need this hacky logic. We need to revisit it once we can better
+    # support downloading large model checkpoints in TorchChat.
+    cp_name = "model.pth" if local_rank == 0 else f"consolidated.0{local_rank}.pth"
+    checkpoint_path = str(builder_args.checkpoint_path) if local_rank == 0 else os.path.join(builder_args.dcp_dir, cp_name)
+    print(f"Loading {cp_name} on rank {local_rank}")
+    return torch.load(
+        checkpoint_path,
+        map_location=builder_args.device,
+        mmap=True,
+    )
+
+
+def load_checkpoints_to_model(
+    model: nn.Module,
+    builder_args,  # TODO: Need to remove the circular dependency before specifying the type.
+    world_mesh: DeviceMesh,
+) -> nn.Module:
+    """
+    Load the original checkpoint into the (already parallelized) model,
+    using DCP for resharding and materialization.
+
+    Args:
+        model (:class:`nn.Module`):
+            Module to load the checkpoint into.
+        builder_args (:class:`BuilderArgs`):
+            Command args for model building.
+        world_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology
+            of devices for the DTensor.
+    Returns:
+        A :class:`nn.Module` object which is parallelized and has its checkpoint loaded.
+    """
+    tp_mesh = world_mesh["tp"]
+    local_rank = tp_mesh.get_local_rank()
+    state_dict_storage = _load_checkpoints_from_storage(builder_args, local_rank)
+    dist_state_dict = _build_distributed_state_dict(state_dict_storage, tp_mesh)
+    # The format of the state_dict loaded from disk is different from what we
+    # use for inference. As long as we can represent it using DTensor, we can
+    # leverage DCP for the resharding and materialization.
+    CHECKPOINT_DIR = builder_args.dcp_dir / "converted_checkpoints"
+    dist_cp.save(
+        state_dict=dist_state_dict,
+        storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
+    )
+
+    model_state_dict = model.state_dict()
+    dist_cp.load(
+        state_dict=model_state_dict,
+        storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
+    )
+    model.load_state_dict(model_state_dict, assign=True)
+    return model
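
The core trick in load_checkpoints_to_model is the DCP round trip: wrap each rank's on-disk shard as a DTensor, save it with DCP, then load into the parallelized model's state_dict and let DCP handle resharding and materialization. Below is a minimal, self-contained sketch of that mechanism with toy shapes and an assumed temporary path (not part of the commit), intended to be launched with torchrun on CUDA hosts:

import os
import torch
import torch.distributed.checkpoint as dist_cp
from torch.distributed._tensor import DTensor, Replicate, Shard
from torch.distributed.device_mesh import init_device_mesh

def dcp_round_trip(tmp_dir: str = "/tmp/dcp_demo") -> torch.Tensor:
    world_size = int(os.environ["WORLD_SIZE"])
    mesh = init_device_mesh("cuda", (world_size,), mesh_dim_names=("tp",))

    # Pretend each rank loaded its slice of a dim-0-sharded weight from disk.
    local_shard = torch.randn(4, 8, device="cuda")
    src = {"w": DTensor.from_local(local_shard, mesh, [Shard(0)], run_check=False)}
    dist_cp.save(state_dict=src, storage_writer=dist_cp.FileSystemWriter(tmp_dir))

    # The destination asks for the full tensor on every rank (Replicate);
    # DCP reshards on load, as it does for the parallelized model's state_dict.
    full = torch.empty(4 * world_size, 8, device="cuda")
    dst = {"w": DTensor.from_local(full, mesh, [Replicate()], run_check=False)}
    dist_cp.load(state_dict=dst, storage_reader=dist_cp.FileSystemReader(tmp_dir))
    return dst["w"].to_local()
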

distributed/run_dist_inference.sh

Lines changed: 1 addition & 1 deletion
@@ -28,4 +28,4 @@ fi

 torchrun --nproc_per_node=${NGPU} --rdzv_backend c10d --rdzv_endpoint="localhost:0" \
 --local-ranks-filter ${LOG_RANK} --role rank --tee 3 \
-torchchat.py chat llama3 --distributed $overrides
+torchchat.py chat llama3-70b --distributed $overrides --dcp-dir ~/.torchchat/model-cache/meta-llama/Meta-Llama-3-70B-Instruct/original
