Commit f11df81

Add comments and all notes for the code
1 parent bc8f239 commit f11df81

2 files changed: +117 additions, −15 deletions

build/builder.py

Lines changed: 44 additions & 4 deletions
@@ -9,10 +9,12 @@
 import time
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Optional, Tuple, Union
 from utils.measure_time import measure_time
 
 import torch
+import torch.nn as nn
+from torch.distributed.device_mesh import DeviceMesh
 import torch._dynamo.config
 import torch._inductor.config

@@ -303,7 +305,6 @@ def _load_model_default(builder_args, only_config=False):
     model = _init_model_on_meta_device(builder_args)
     # checkpoint = torch.load(str(builder_args.checkpoint_path), mmap=True, weights_only=True)
     cps = []
-    print(f"Loading {builder_args.checkpoint_path} dir: {builder_args.checkpoint_dir}")
     if builder_args.checkpoint_dir is not None:
         # Load multiple checkpoint; ignore the single path.
         builder_args.checkpoint_path = None
@@ -344,7 +345,23 @@ def _load_model_default(builder_args, only_config=False):
     return model
 
 
-def _maybe_init_distributed(builder_args):
+def _maybe_init_distributed(
+    builder_args: BuilderArgs,
+) -> Tuple[Optional[DeviceMesh], Optional[ParallelDims]]:
+    """
+    Initialize distributed-related setup if the user specified
+    distributed inference. If not, this is a no-op.
+
+    Args:
+        builder_args (:class:`BuilderArgs`):
+            Command args for model building.
+    Returns:
+        Tuple[Optional[DeviceMesh], Optional[ParallelDims]]:
+            - The first element is an optional DeviceMesh object,
+              which describes the mesh topology of devices for the DTensor.
+            - The second element is an optional ParallelDims object,
+              which represents the parallel dimensions configuration.
+    """
     if not builder_args.use_distributed:
         return None, None
     # TODO: ongoing work to support loading model from checkpoint
@@ -361,7 +378,30 @@ def _maybe_init_distributed(builder_args):
     return world_mesh, parallel_dims
 
 
-def _maybe_parellelize_model(model, builder_args, world_mesh, parallel_dims):
+def _maybe_parellelize_model(
+    model: nn.Module,
+    builder_args: BuilderArgs,
+    world_mesh: DeviceMesh,
+    parallel_dims: ParallelDims,
+) -> nn.Module:
+    """
+    Parallelize the model and load the distributed checkpoint into it
+    if the user specifies distributed inference. If not, this is a no-op.
+
+    Args:
+        model (:class:`nn.Module`):
+            Model to be parallelized.
+        builder_args (:class:`BuilderArgs`):
+            Command args for model building.
+        world_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology
+            of devices for the DTensor.
+        parallel_dims (:class:`ParallelDims`):
+            Object which represents the parallel dimensions configuration.
+    Returns:
+        A :class:`nn.Module` object which is parallelized and has its
+        checkpoint loaded if the user specifies distributed inference.
+    """
     if world_mesh is None:
         return model
     assert parallel_dims is not None
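
Note: the new type hints refer to DeviceMesh and ParallelDims. ParallelDims is the parallel-dims configuration object defined elsewhere in the repo and is not sketched here. As a rough, hypothetical illustration (not code from this commit) of the kind of 1-D tensor-parallel DeviceMesh that _maybe_init_distributed is documented to return, assuming two CPU ranks launched with torchrun:

# Hypothetical sketch, not part of this commit.
# Run with: torchrun --nproc-per-node=2 mesh_sketch.py
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh

dist.init_process_group(backend="gloo")
# A 1-D mesh named "tp"; distributed/checkpoint.py later takes its "tp" sub-mesh.
world_mesh = init_device_mesh("cpu", (dist.get_world_size(),), mesh_dim_names=("tp",))
print(world_mesh)                   # e.g. DeviceMesh('cpu', [0, 1], mesh_dim_names=('tp',))
print(world_mesh.get_local_rank())  # 0 on rank 0, 1 on rank 1
dist.destroy_process_group()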

distributed/checkpoint.py

Lines changed: 73 additions & 11 deletions
@@ -5,10 +5,13 @@
 # LICENSE file in the root directory of this source tree.
 
 import os
+from typing import Any, Mapping
 
 import torch
+import torch.nn as nn
 import torch.distributed.checkpoint as dist_cp
 from torch.distributed._tensor import DTensor, Replicate, Shard
+from torch.distributed.device_mesh import DeviceMesh
 
 STATE_DICT_SHARDING_DIM_MAP = {
     "tok_embeddings.weight": 0,
@@ -19,35 +22,73 @@
     "feed_forward.w1.weight" : 0,
     "feed_forward.w2.weight" : 1,
     "feed_forward.w3.weight" : 0,
-
-    "attention_norm.weight" : -1,
-    "ffn_norm.weight": -1,
-    "norm.weight" : -1,
     "output.weight":0,
 }
 
 
-def _get_maybe_shard_for_weight(fqn_key):
+def _look_up_maybe_shard_for_weight(fqn: str) -> int:
+    """
+    Look up the sharding dim for the given fqn. If not found, return -1.
+
+    Args:
+        fqn (str): Fully qualified name of the parameter.
+    Returns:
+        int: sharding dim of the parameter.
+    """
     for pattern, value in STATE_DICT_SHARDING_DIM_MAP.items():
-        if fqn_key.endswith(pattern):
+        if fqn.endswith(pattern):
             return value
     return -1
 
 
-def _build_distributed_state_dict(state_dict, tp_mesh):
+def _build_distributed_state_dict(
+    state_dict: Mapping[str, Any],
+    tp_mesh: DeviceMesh,
+) -> Mapping[str, DTensor]:
+    """
+    Convert the original LLaMa checkpoint from local disk to a DTensor-based
+    distributed state dict so that we can leverage distributed
+    checkpoint (DCP) for state_dict resharding and materialization.
+
+    Args:
+        state_dict (dict):
+            A state_dict loaded from local disk.
+        tp_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh sub-topology
+            of devices for Tensor Parallelism.
+    Returns:
+        A state_dict with all values converted to DTensor.
+    """
     dist_state_dict = {}
     for k, v in state_dict.items():
-        shard = _get_maybe_shard_for_weight(k)
+        shard = _look_up_maybe_shard_for_weight(k)
         if shard > 0:
             dist_state_dict[k] = DTensor.from_local(v, tp_mesh, [Shard(shard)], run_check=False)
         else:
             dist_state_dict[k] = DTensor.from_local(v, tp_mesh, [Replicate()], run_check=False)
     return dist_state_dict
 
 
-def _load_checkpoints_from_storage(builder_args, local_rank):
+def _load_checkpoints_from_storage(
+    builder_args,  # TODO: Need to remove the circular dependency before specifying the type.
+    local_rank: int,
+) -> Mapping[str, Any]:
+    """
+    Load the original LLaMa checkpoint from local disk.
+
+    Args:
+        builder_args (:class:`BuilderArgs`):
+            Command args for model building.
+        local_rank (int):
+            Local rank for tensor parallelism.
+    Returns:
+        A state_dict loaded from local disk.
+    """
     assert builder_args.checkpoint_dir is not None, "One needs to specify --checkpoint-path to load from storage"
-    #NOTE: We made a couple assumptions here:
+    # NOTE: We make a couple of assumptions here:
+    # download.py in TorchChat renames `consolidated.00.pth` to `model.pth`,
+    # which is why we need this hacky logic. We should revisit this once we can
+    # better support downloading large model checkpoints in TorchChat.
     cp_name = "model.pth" if local_rank == 0 else f"consolidated.0{local_rank}.pth"
     checkpoint_path = str(builder_args.checkpoint_path) if local_rank == 0 else os.path.join(builder_args.checkpoint_dir, cp_name)
     print(f"Loading {cp_name} on rank {local_rank}")
@@ -58,11 +99,32 @@ def _load_checkpoints_from_storage(builder_args, local_rank):
     )
 
 
-def load_checkpoints_to_model(model, builder_args, world_mesh):
+def load_checkpoints_to_model(
+    model: nn.Module,
+    builder_args,  # TODO: Need to remove the circular dependency before specifying the type.
+    world_mesh: DeviceMesh,
+) -> nn.Module:
+    """
+    Parallelize the model and load the distributed checkpoint into it.
+
+    Args:
+        model (:class:`nn.Module`):
+            Model to be parallelized.
+        builder_args (:class:`BuilderArgs`):
+            Command args for model building.
+        world_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology
+            of devices for the DTensor.
+    Returns:
+        A :class:`nn.Module` object which is parallelized and has its checkpoint loaded.
+    """
     tp_mesh = world_mesh["tp"]
     local_rank = tp_mesh.get_local_rank()
     state_dict_storage = _load_checkpoints_from_storage(builder_args, local_rank)
     dist_state_dict = _build_distributed_state_dict(state_dict_storage, tp_mesh)
+    # The format of the state_dict loaded from disk differs from the format
+    # we use for inference. As long as we can represent it with DTensor,
+    # we can leverage DCP for the resharding and materialization.
     CHECKPOINT_DIR="converted_checkpoints"
     dist_cp.save(
         state_dict=dist_state_dict,
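
The inline comment about representing the loaded checkpoint as DTensors and letting DCP handle resharding and materialization can be illustrated with a hypothetical end-to-end sketch (not the repository's code; it assumes two CPU ranks launched with torchrun and borrows the CHECKPOINT_DIR name from the diff):

# Hypothetical sketch, not part of this commit.
# Run with: torchrun --nproc-per-node=2 dcp_sketch.py
import torch
import torch.distributed as dist
import torch.distributed.checkpoint as dist_cp
from torch.distributed._tensor import DTensor, Replicate, Shard
from torch.distributed.device_mesh import init_device_mesh

dist.init_process_group(backend="gloo")
tp_mesh = init_device_mesh("cpu", (dist.get_world_size(),), mesh_dim_names=("tp",))

# Pretend these are this rank's slice of a 4x4 weight and a full norm weight.
local_w = torch.randn(4 // dist.get_world_size(), 4)
norm_w = torch.ones(4)

dist_state_dict = {
    "layers.0.feed_forward.w1.weight": DTensor.from_local(
        local_w, tp_mesh, [Shard(0)], run_check=False
    ),
    "layers.0.ffn_norm.weight": DTensor.from_local(
        norm_w, tp_mesh, [Replicate()], run_check=False
    ),
}

CHECKPOINT_DIR = "converted_checkpoints"  # name borrowed from the diff above
dist_cp.save(
    state_dict=dist_state_dict,
    storage_writer=dist_cp.FileSystemWriter(CHECKPOINT_DIR),
)

# Loading in place: DCP reshards the saved tensors into the layout of the
# destination state_dict, which may differ from the layout they were saved in.
dist_cp.load(
    state_dict=dist_state_dict,
    storage_reader=dist_cp.FileSystemReader(CHECKPOINT_DIR),
)
dist.destroy_process_group()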
