[Dist][Inference] U-haul TP and distributed utils code to TorchChat

fduwjj · fduwjj · commit 0c3e7bf297e0 · 2024-07-01T12:13:47.000-07:00
diff --git a/build/builder.py b/build/builder.py
@@ -21,6 +21,7 @@
 
 from build.model import Transformer
 from build.utils import device_sync, is_cpu_device, is_cuda_or_cpu_device, name_to_dtype
+from distributed import parallelize_llama, ParallelDims, ParallelConfig
 
 
 @dataclass
@@ -36,7 +37,7 @@ class BuilderArgs:
     device: Optional[str] = None
     precision: torch.dtype = torch.float32
     setup_caches: bool = False
-    use_tp: bool = False
+    use_distributed: bool = False
     is_chat_model: bool = False
     prefill_possible: bool = False
 
@@ -141,7 +142,7 @@ def from_args(cls, args):  # -> BuilderArgs:
             device=args.device,
             precision=dtype,
             setup_caches=(args.output_dso_path or args.output_pte_path),
-            use_tp=False,
+            use_distributed=False,
             is_chat_model=is_chat_model,
         )
 
@@ -346,11 +347,39 @@ def _load_model(builder_args, only_config=False):
     else:
         model = _load_model_default(builder_args)
 
-    if builder_args.use_tp:
-        from tp import apply_tp
+    if builder_args.use_distributed:
+        # WORLD_SIZE and LOCAL_RANK are read from the environment (a launcher
+        # such as torchrun is expected to export them); a missing variable
+        # raises KeyError.
+        world_size = int(os.environ["WORLD_SIZE"])
+        local_rank = int(os.environ["LOCAL_RANK"])
+
+        # NOTE(review): ParallelConfig() defaults to tp_degree=pp_degree=1, so
+        # ParallelDims only validates when WORLD_SIZE == 1 -- confirm how the
+        # degrees are meant to be supplied.
+        parallel_config = ParallelConfig()
+        parallel_dims = ParallelDims(
+            tp=parallel_config.tp_degree,
+            pp=parallel_config.pp_degree,
+            world_size=world_size,
+        )
+
+        # Select this rank's GPU before any process group is created.
+        device = torch.device(f"cuda:{local_rank}")
+        torch.cuda.set_device(device)
+
+        # Replaces `init_distributed(job_config)`: neither name is defined or
+        # imported anywhere this change shows.
+        if not torch.distributed.is_initialized():
+            torch.distributed.init_process_group(backend="nccl")
+        world_mesh = parallel_dims.build_mesh(device_type="cuda")
 
-        print("Applying tensor parallel to model ...")
-        apply_tp(model)
+        print("Applying model parallel to model ...")
+        # parallelize_llama requires (model, world_mesh, parallel_dims, config)
+        # and returns the model, so pass all four and keep the result.
+        model = parallelize_llama(
+            model, world_mesh, parallel_dims, parallel_config
+        )
 
     model = model.to(device=builder_args.device, dtype=builder_args.precision)
     return model.eval()
diff --git a/distributed/__init__.py b/distributed/__init__.py
@@ -0,0 +1,2 @@
+from distributed.parallelize_llama import parallelize_llama
+from distributed.parallel_config import ParallelConfig, ParallelDims
diff --git a/distributed/parallel_config.py b/distributed/parallel_config.py
@@ -0,0 +1,82 @@
+import logging
+from dataclasses import dataclass, field
+
+from torch.distributed.device_mesh import DeviceMesh, init_device_mesh
+
+# Module-level logger; build_mesh() logs through it.
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ParallelConfig:
+    """User-facing knobs describing how a model should be parallelized."""
+
+    # Free-form label; not read by any code in this module.
+    name: str = field(default="")
+    # Float8 linear mode; empty by default.
+    fp8_linear: str = field(default="")
+    # Tensor-parallel degree; defaults to 1.
+    tp_degree: int = field(default=1)
+    # Pipeline-parallel degree; defaults to 1.
+    pp_degree: int = field(default=1)
+
+
+@dataclass
+class ParallelDims:
+    """Validated parallel degrees for a job of ``world_size`` ranks.
+
+    Raises:
+        AssertionError: if ``tp`` or ``pp`` is below 1, or if
+            ``tp * pp != world_size``.
+    """
+
+    tp: int
+    pp: int
+    world_size: int
+
+    def __post_init__(self) -> None:
+        self._validate()
+
+    def _validate(self) -> None:
+        """Check that the degrees are positive and multiply to ``world_size``."""
+        tp, pp = self.tp, self.pp
+        assert tp >= 1, tp
+        assert pp >= 1, pp
+        assert (
+            tp * pp == self.world_size
+        ), f"Invalid parallel dims: tp({tp}) * pp({pp}) != WORLD_SIZE({self.world_size})"
+
+    def build_mesh(self, device_type: str) -> DeviceMesh:
+        """Create a device mesh with one named dimension per enabled degree.
+
+        Dimensions are ordered ``pp`` then ``tp``; a degree of 1 adds no
+        mesh dimension.
+
+        Args:
+            device_type: Device type forwarded to ``init_device_mesh``.
+
+        Returns:
+            The mesh returned by ``init_device_mesh``.
+        """
+        dims = []
+        names = []
+        for d, name in zip(
+            [self.pp, self.tp], ["pp", "tp"], strict=True
+        ):
+            if d > 1:
+                dims.append(d)
+                names.append(name)
+        logger.info(
+            "Building %d-D device mesh with %s, %s", len(dims), names, dims
+        )
+        return init_device_mesh(device_type, dims, mesh_dim_names=tuple(names))
+
+    @property
+    def tp_enabled(self) -> bool:
+        """Whether tensor parallelism is enabled (``tp > 1``)."""
+        return self.tp > 1
+
+    @property
+    def pp_enabled(self) -> bool:
+        """Whether pipeline parallelism is enabled (``pp > 1``)."""
+        return self.pp > 1
diff --git a/distributed/parallelize_llama.py b/distributed/parallelize_llama.py
@@ -0,0 +1,148 @@
+import logging
+from typing import Tuple, Type
+
+from torch.distributed.tensor.parallel import (
+    ColwiseParallel,
+    parallelize_module,
+    PrepareModuleInput,
+    RowwiseParallel,
+    SequenceParallel,
+)
+
+try:
+    from torch.distributed.tensor import Replicate, Shard
+except ImportError:
+    # Older PyTorch releases only expose the placements under the private path.
+    from torch.distributed._tensor import Replicate, Shard
+
+from distributed.parallel_config import ParallelConfig
+
+logger = logging.getLogger(__name__)
+
+
+def get_tp_parallel_strategy(
+    config: ParallelConfig,
+) -> Tuple[Type[RowwiseParallel], Type[ColwiseParallel], Type[PrepareModuleInput]]:
+    """Return the (rowwise, colwise, prepare-input) parallel style classes.
+
+    When ``config.fp8_linear == "dynamic"`` the float8 variants from
+    ``float8_experimental`` are imported lazily and returned instead of the
+    standard ``torch.distributed.tensor.parallel`` styles.
+    """
+    if config.fp8_linear == "dynamic":
+        from float8_experimental.float8_tensor_parallel import (
+            Float8ColwiseParallel,
+            Float8RowwiseParallel,
+            PrepareFloat8ModuleInput,
+        )
+
+        return Float8RowwiseParallel, Float8ColwiseParallel, PrepareFloat8ModuleInput
+    return RowwiseParallel, ColwiseParallel, PrepareModuleInput
+
+
+def apply_tp(model, world_mesh, parallel_dims, config: ParallelConfig):
+    """Apply tensor parallelism (with sequence-parallel norms) to ``model``.
+
+    Args:
+        model: Module exposing ``tok_embeddings``, ``output``, ``norm`` and
+            ``layers``; each layer exposes ``attention``, ``feed_forward``,
+            ``attention_norm`` and ``ffn_norm``.
+        world_mesh: Device mesh with a ``"tp"`` dimension.
+        parallel_dims: Parallel degrees; ``loss_parallel_enabled`` is read
+            when present and treated as False otherwise.
+        config: Selects the standard or float8 parallel styles.
+
+    Returns:
+        The model returned by ``parallelize_module``.
+    """
+
+    tp_mesh = world_mesh["tp"]
+    (
+        row_parallel_strategy,
+        col_parallel_strategy,
+        prepare_module_input,
+    ) = get_tp_parallel_strategy(config)
+    # The passed-in dims object may not define `loss_parallel_enabled`; default
+    # to False so the output is replicated instead of raising AttributeError.
+    loss_parallel = getattr(parallel_dims, "loss_parallel_enabled", False)
+
+    # 1. Parallelize the first embedding and the last linear proj layer
+    # 2. Parallelize the root norm layer over the sequence dim
+    # 3. Shard the first transformer block's inputs
+    model = parallelize_module(
+        model,
+        tp_mesh,
+        {
+            "tok_embeddings": RowwiseParallel(
+                input_layouts=Replicate(),
+                output_layouts=Shard(1),
+            ),
+            "output": col_parallel_strategy(
+                input_layouts=Shard(1),
+                output_layouts=Shard(-1) if loss_parallel else Replicate(),
+                use_local_output=not loss_parallel,
+            ),
+            "norm": SequenceParallel(),
+        },
+    )
+
+    # `layers` may be a mapping-style container (ModuleDict) or a sequence
+    # (ModuleList); support both rather than assuming `.items()` exists.
+    layers = model.layers
+    transformer_blocks = layers.values() if hasattr(layers, "values") else layers
+    tp_size = tp_mesh.size()
+
+    # Apply tensor + sequence parallelism to every transformer block
+    for transformer_block in transformer_blocks:
+        layer_plan = {
+            "attention": prepare_module_input(
+                input_layouts=(Shard(1), None),
+                desired_input_layouts=(Replicate(), None),
+            ),
+            "attention.wq": col_parallel_strategy(),
+            "attention.wk": col_parallel_strategy(),
+            "attention.wv": col_parallel_strategy(),
+            "attention.wo": row_parallel_strategy(output_layouts=Shard(1)),
+            "attention_norm": SequenceParallel(),
+            "feed_forward": prepare_module_input(
+                input_layouts=(Shard(1),),
+                desired_input_layouts=(Replicate(),),
+            ),
+            "feed_forward.w1": col_parallel_strategy(),
+            "feed_forward.w2": row_parallel_strategy(output_layouts=Shard(1)),
+            "feed_forward.w3": col_parallel_strategy(),
+            "ffn_norm": SequenceParallel(),
+        }
+
+        # Adjust attention module to use the local number of heads
+        attn_layer = transformer_block.attention
+        attn_layer.n_heads = attn_layer.n_heads // tp_size
+        attn_layer.n_kv_heads = attn_layer.n_kv_heads // tp_size
+
+        parallelize_module(
+            module=transformer_block,
+            device_mesh=tp_mesh,
+            parallelize_plan=layer_plan,
+        )
+
+    logger.info("Applied Tensor Parallelism to the model")
+    return model
+
+
+def parallelize_llama(model, world_mesh, parallel_dims, config: ParallelConfig):
+    """Apply the enabled parallelisms to ``model`` and return it.
+
+    Only tensor parallelism is applied for now, and only when
+    ``parallel_dims.tp_enabled`` is true.
+
+    NOTE: The passed-in model preferably should be on meta device. Otherwise,
+    the model must fit on GPU or CPU memory.
+    """
+
+    if parallel_dims.tp_enabled:
+        # Forward `config`; the previous `job_config` name was undefined here.
+        model = apply_tp(model, world_mesh, parallel_dims, config)
+
+    # TODO: torch.compile support is not wired up yet; only TP is enabled.
+
+    return model
diff --git a/distributed/utils.py b/distributed/utils.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+from distributed.parallelize_llama import parallelize_llama`
	`2`	`+from distributed.parallel_config import ParallelConfig, ParallelDims`