     SequenceParallel,
 )
 
-from distributed.parallel_config import ParallelConfig
+import torch.nn as nn
+from distributed.parallel_config import ParallelDims
+from torch.distributed.device_mesh import DeviceMesh
 
 
-def get_tp_parallel_strategy(
-    config: ParallelConfig,
-) -> Tuple[RowwiseParallel, ColwiseParallel, PrepareModuleInput]:
-    """Get the parallel strategy for the transformer model.
-
-    This function handles the special case of using float8 with tensor parallelism.
-    """
-    if config.fp8_linear == "dynamic":
-        from float8_experimental.float8_tensor_parallel import (
-            Float8ColwiseParallel,
-            Float8RowwiseParallel,
-            PrepareFloat8ModuleInput,
-        )
-
-        return Float8RowwiseParallel, Float8ColwiseParallel, PrepareFloat8ModuleInput
-    return RowwiseParallel, ColwiseParallel, PrepareModuleInput
-
-
-def apply_tp(model, world_mesh, parallel_dims, config: ParallelConfig):
+def apply_tp(
+    model: nn.Module,
+    world_mesh: DeviceMesh,
+) -> nn.Module:
     """
-    Apply tensor parallelism.
+    Apply tensor parallelism to the given model. More details can be
+    found in https://pytorch.org/tutorials/intermediate/TP_tutorial.html.
+
+    NOTE: This parallelization assumes the model is a LLaMA model. For any other
+    architecture, the ``parallelize_plan`` passed to the TP API needs to be
+    adjusted accordingly.
+
+    Args:
+        model (:class:`nn.Module`):
+            Model to be parallelized.
+        world_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology
+            of devices for the DTensor.
+    Return:
+        The tensor-parallelized :class:`nn.Module`.
     """
 
     tp_mesh = world_mesh["tp"]
-    (
-        row_parallel_strategy,
-        col_parallel_strategy,
-        prepare_module_input,
-    ) = get_tp_parallel_strategy(config)
-    loss_parallel = parallel_dims.loss_parallel_enabled
 
     # 1. Parallelize the first embedding and the last linear proj layer
     # 2. Parallelize the root norm layer over the sequence dim
@@ -58,10 +54,10 @@ def apply_tp(model, world_mesh, parallel_dims, config: ParallelConfig):
                 input_layouts=Replicate(),
                 output_layouts=Shard(1),
             ),
-            "output": col_parallel_strategy(
+            "output": ColwiseParallel(
                 input_layouts=Shard(1),
-                output_layouts=Shard(-1) if loss_parallel else Replicate(),
-                use_local_output=not loss_parallel,
+                output_layouts=Replicate(),
+                use_local_output=True,
             ),
             "norm": SequenceParallel(),
         },
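With this hunk the `output` projection returns a replicated local tensor (`output_layouts=Replicate()`, `use_local_output=True`) instead of a loss-parallel `Shard(-1)` DTensor, so the loss can be computed directly on the full logits. A minimal sketch of the training-side consequence, where `model`, `tokens`, and `labels` are hypothetical stand-ins for the real training loop:

```python
import torch.nn.functional as F

# With use_local_output=True, the TP'd output projection hands back a plain
# torch.Tensor replicated across the "tp" mesh, so no loss-parallel handling
# is needed; a standard cross-entropy over the full vocabulary works.
logits = model(tokens)            # (batch, seq, vocab), replicated on all ranks
loss = F.cross_entropy(
    logits.flatten(0, 1),         # (batch * seq, vocab)
    labels.flatten(),             # (batch * seq,)
)
loss.backward()
```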
@@ -74,18 +70,18 @@ def apply_tp(model, world_mesh, parallel_dims, config: ParallelConfig):
                 input_layouts=(Shard(1), None),
                 desired_input_layouts=(Replicate(), None),
             ),
-            "attention.wq": col_parallel_strategy(),
-            "attention.wk": col_parallel_strategy(),
-            "attention.wv": col_parallel_strategy(),
-            "attention.wo": row_parallel_strategy(output_layouts=Shard(1)),
+            "attention.wq": ColwiseParallel(),
+            "attention.wk": ColwiseParallel(),
+            "attention.wv": ColwiseParallel(),
+            "attention.wo": RowwiseParallel(output_layouts=Shard(1)),
             "attention_norm": SequenceParallel(),
             "feed_forward": prepare_module_input(
                 input_layouts=(Shard(1),),
                 desired_input_layouts=(Replicate(),),
             ),
-            "feed_forward.w1": col_parallel_strategy(),
-            "feed_forward.w2": row_parallel_strategy(output_layouts=Shard(1)),
-            "feed_forward.w3": col_parallel_strategy(),
+            "feed_forward.w1": ColwiseParallel(),
+            "feed_forward.w2": RowwiseParallel(output_layouts=Shard(1)),
+            "feed_forward.w3": ColwiseParallel(),
             "ffn_norm": SequenceParallel(),
         }
 
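The hunk above only covers the contents of the per-block `layer_plan`; the code that applies it sits outside the visible diff. As a rough sketch of how such a plan is typically wired up with `parallelize_module`, assuming the transformer blocks live in `model.layers` (an attribute name not shown in this diff):

```python
from torch.distributed.tensor.parallel import parallelize_module

# Apply the same TP plan to every transformer block on the "tp" sub-mesh.
# `layer_plan` is the dict built above and `tp_mesh = world_mesh["tp"]`.
for transformer_block in model.layers.children():
    parallelize_module(
        module=transformer_block,
        device_mesh=tp_mesh,
        parallelize_plan=layer_plan,
    )
```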
@@ -105,20 +101,31 @@ def apply_tp(model, world_mesh, parallel_dims, config: ParallelConfig):
     return model
 
 
-def parallelize_llama(model, world_mesh, parallel_dims, config: ParallelConfig):
+def parallelize_llama(
+    model: nn.Module,
+    world_mesh: DeviceMesh,
+    parallel_dims: ParallelDims,
+) -> nn.Module:
     """
     Apply tensor parallelism, activation checkpointing, torch.compile, and data
     parallelism to the model.
 
     NOTE: The passed-in model preferably should be on meta device. Otherwise,
     the model must fit on GPU or CPU memory.
+
+    Args:
+        model (:class:`nn.Module`):
+            Model to be parallelized.
+        world_mesh (:class:`DeviceMesh`):
+            Object which describes the mesh topology
+            of devices for the DTensor.
+        parallel_dims (:class:`ParallelDims`):
+            Utility object that holds the degree of each parallelism dimension.
+    Return:
+        The parallelized :class:`nn.Module`.
     """
 
     if parallel_dims.tp_enabled:
-        model = apply_tp(model, world_mesh, parallel_dims, job_config)
-
-        # only enable TP for now.
-        # if job_config.training.compile:
-        #     model = apply_compile(model, job_config)
+        model = apply_tp(model, world_mesh)
 
     return model
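For reference, a hedged usage sketch of the new entry point. The `ParallelDims` constructor is not shown in this diff, so its arguments below are illustrative assumptions; only the `tp_enabled` attribute is referenced by the code above, and `build_model()` is a placeholder for the actual model constructor:

```python
import torch
from torch.distributed.device_mesh import init_device_mesh

from distributed.parallel_config import ParallelDims

# 1-D device mesh whose dimension is named "tp", matching the world_mesh["tp"]
# lookup inside apply_tp. Eight GPUs assumed purely for illustration.
world_mesh = init_device_mesh("cuda", (8,), mesh_dim_names=("tp",))

# Hypothetical construction; the real ParallelDims fields may differ.
parallel_dims = ParallelDims(tp=8)

# Per the docstring, the model is preferably built on the meta device first.
with torch.device("meta"):
    model = build_model()  # placeholder for the actual model constructor

model = parallelize_llama(model, world_mesh, parallel_dims)
```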