# LICENSE file in the root directory of this source tree.
import os
- from dataclasses import dataclass
from datetime import timedelta
- from typing import Union

import torch
- import torch.distributed._functional_collectives as funcol
- import torch.distributed.distributed_c10d as c10d
- from torch.distributed.device_mesh import DeviceMesh
- from torchtitan.logging_utils import logger
- from torchtitan.parallelisms import ParallelDims
-
-
- def dist_max(x: Union[int, float], mesh: DeviceMesh) -> float:
-     tensor = torch.tensor(x).cuda()
-     return funcol.all_reduce(tensor, reduceOp=c10d.ReduceOp.MAX.name, group=mesh)
-
-
- def dist_mean(x: Union[int, float], mesh: DeviceMesh) -> float:
-     tensor = torch.tensor(x).cuda()
-     return funcol.all_reduce(tensor, reduceOp=c10d.ReduceOp.AVG.name, group=mesh)

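# Usage sketch (hypothetical, not part of the diff) for the removed collectives
# helpers above, assuming torch.distributed is initialized and CUDA is available:
#     from torch.distributed.device_mesh import init_device_mesh
#     mesh = init_device_mesh("cuda", (world_size,))
#     max_loss = dist_max(loss.item(), mesh)    # global MAX across ranks
#     avg_loss = dist_mean(loss.item(), mesh)   # global mean across ranks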
def _warn_overwrite_env(env, val):
@@ -35,24 +18,6 @@ def _warn_overwrite_env(env, val):
    os.environ[env] = val


- def get_metrics_rank(world_mesh: DeviceMesh, parallel_dims: ParallelDims) -> int:
-     """
-     Returns global rank 0 in non-pipeline-parallel configs, and returns the global
-     rank of the 0th rank in the last pipeline stage when pipeline parallelism is enabled.
-     """
-     if parallel_dims.pp_enabled:
-         assert (
-             world_mesh.mesh_dim_names[0] == "pp"
-         ), "get_metrics_rank assumes pp is the outer mesh dim"
-         pp_mesh = world_mesh["pp"]
-         pp_size = pp_mesh.size()
-         metrics_log_rank = int((world_mesh.size() // pp_size) * (pp_size - 1))
-     else:
-         metrics_log_rank = 0
-
-     return metrics_log_rank

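# Worked example (hypothetical numbers) for get_metrics_rank above: with a
# world size of 8 and pipeline-parallel degree 2 as the outer mesh dim, the
# last pipeline stage owns global ranks 4..7, so metrics are logged from rank
# (8 // 2) * (2 - 1) = 4, the 0th rank of that last stage.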
def set_pg_timeouts(timeout, world_mesh):
    """
    Sets the timeout for all PGs in the provided mesh, and the default (world) group.
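# Usage sketch (hypothetical) for set_pg_timeouts: tightening NCCL timeouts
# once start-up warm-up is done, e.g.
#     from datetime import timedelta
#     set_pg_timeouts(timeout=timedelta(seconds=100), world_mesh=world_mesh)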
@@ -111,72 +76,3 @@ def init_distributed(job_config):
    # async_op=True hold memory longer than they should
    # such as those in tensor parallelism
    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"
-
-
- def get_num_params(model: torch.nn.Module, exclude_embedding: bool = False) -> int:
-     num_params = sum(p.numel() for p in model.parameters())
-     if exclude_embedding:
-         num_params -= model.tok_embeddings.weight.numel()
-     return num_params
-
-
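# Note on get_num_params above: exclude_embedding=True assumes the model
# exposes its token embedding as model.tok_embeddings (as torchtitan's
# Transformer does); e.g. get_num_params(model, exclude_embedding=True)
# gives the non-embedding parameter count used for FLOP estimates.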
- def get_num_flop_per_token(num_params: int, model_config, seq_len) -> int:
-     l, h, q, t = (
-         model_config.n_layers,
-         model_config.n_heads,
-         model_config.dim // model_config.n_heads,
-         seq_len,
-     )
-     # Reasoning behind the factor of 12 for the self-attention part of the formula:
-     # 1. each self-attention block has 2 matmuls in the forward pass and 4 in the backward pass (6)
-     # 2. flash attention does 1 extra matmul recomputation in the backward pass,
-     #    but recomputation should not be counted toward MFU (+0)
-     # 3. each matmul performs 1 multiplication and 1 addition (*2)
-     # 4. we follow the convention and do not account for sparsity in causal attention
-     flop_per_token = 6 * num_params + 12 * l * h * q * t
-
-     return flop_per_token
-
-
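# Worked example for the formula above (hypothetical Llama-style config):
# with num_params = 7e9, n_layers = 32, n_heads = 32, dim = 4096
# (so q = 4096 // 32 = 128) and seq_len = 4096,
#     flop_per_token = 6 * 7e9 + 12 * 32 * 32 * 128 * 4096
#                    = 42e9 + ~6.44e9 ≈ 4.84e10 FLOPs per token.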
- # hardcoded BF16 peak FLOPS for NVIDIA A100 and H100 GPUs
- def get_peak_flops(device_name: str) -> int:
-     if "A100" in device_name:
-         # data from https://www.nvidia.com/en-us/data-center/a100/
-         return 312e12
-     elif "H100" in device_name:
-         # data from https://www.nvidia.com/en-us/data-center/h100/
-         # NOTE: Specifications are one-half lower without sparsity.
-         if "NVL" in device_name:
-             return 1979e12
-         elif "PCIe" in device_name:
-             return 756e12
-         else:  # for SXM and other variants
-             return 989e12
-     else:  # for other GPU types, assume A100
-         return 312e12
-
-
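# Usage sketch (hypothetical) combining the removed helpers above to estimate
# model FLOPS utilization (MFU):
#     peak = get_peak_flops(torch.cuda.get_device_name(0))
#     flops_per_token = get_num_flop_per_token(num_params, model_config, seq_len)
#     mfu = flops_per_token * tokens_per_second / peak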
- @dataclass(frozen=True)
- class Color:
-     black = "\033[30m"
-     red = "\033[31m"
-     green = "\033[32m"
-     yellow = "\033[33m"
-     blue = "\033[34m"
-     magenta = "\033[35m"
-     cyan = "\033[36m"
-     white = "\033[37m"
-     reset = "\033[39m"
-
-
- @dataclass(frozen=True)
- class NoColor:
-     black = ""
-     red = ""
-     green = ""
-     yellow = ""
-     blue = ""
-     magenta = ""
-     cyan = ""
-     white = ""
-     reset = ""
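# Usage sketch (hypothetical) for the removed color helpers: pick Color only
# when stdout is a TTY, so logs stay clean when redirected to a file:
#     import sys
#     color = Color if sys.stdout.isatty() else NoColor
#     print(f"{color.yellow}step 10{color.reset}  loss: 2.3456")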