
Commit f2a8a40

ruff and isort

1 parent b4b566a

10 files changed: +43 -74 lines changed


distributed/__init__.py

Lines changed: 4 additions & 4 deletions
@@ -4,9 +4,9 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from distributed.parallelize_llama import parallelize_llama
-from distributed.parallel_config import ParallelDims
-from distributed.utils import init_distributed
 from distributed.checkpoint import load_checkpoints_to_model
-from distributed.world_maker import launch_distributed
 from distributed.logging_utils import logger
+from distributed.parallel_config import ParallelDims
+from distributed.parallelize_llama import parallelize_llama
+from distributed.utils import init_distributed
+from distributed.world_maker import launch_distributed

distributed/checkpoint.py

Lines changed: 1 addition & 1 deletion
@@ -8,8 +8,8 @@
 from typing import Any, Mapping
 
 import torch
-import torch.nn as nn
 import torch.distributed.checkpoint as dist_cp
+import torch.nn as nn
 from torch.distributed._tensor import DTensor, Replicate, Shard
 from torch.distributed.device_mesh import DeviceMesh
 
distributed/config_manager.py

Lines changed: 4 additions & 4 deletions
@@ -5,15 +5,15 @@
 # LICENSE file in the root directory of this source tree.
 
 import argparse
-import sys
-from collections import defaultdict
-from typing import Tuple, Union
 import os
-from distributed.logging_utils import logger
+from collections import defaultdict
 from pathlib import Path
+from typing import Tuple
 
 import torch
 
+from distributed.logging_utils import logger
+
 try:
     import tomllib
 except ModuleNotFoundError:
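
Side note on the try/except at the end of this hunk: tomllib is only in the standard library from Python 3.11, so the usual pattern is to fall back to the third-party tomli package under the same name. The except branch itself is outside this hunk, so the sketch below is an assumption about what it likely contains rather than a quote of the file.

try:
    import tomllib  # standard library on Python 3.11+
except ModuleNotFoundError:
    import tomli as tomllib  # common third-party fallback for older interpreters (assumed here)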

distributed/inference.py

Lines changed: 5 additions & 25 deletions
@@ -4,29 +4,17 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import contextlib
 import os
-import time
-
-from dataclasses import dataclass, field
-from datetime import timedelta
-from io import BytesIO
-from timeit import default_timer as timer
-from typing import Any, Dict, List
-
-import numpy as np
 
 import torch
-import torch.nn.functional as F
-from torch.distributed import destroy_process_group
-from torch.distributed.checkpoint.stateful import Stateful
-from torch.distributed.elastic.multiprocessing.errors import record
-from torch.distributed.tensor.parallel import loss_parallel
-
 from daylight.config_manager import JobConfig
 #from daylight.datasets import build_hf_data_loader, create_tokenizer
 #from daylight.float8_linear import build_fp8_linear
 from daylight.logging_utils import init_logger, logger
+#from daylight.parallelisms.pipelining_utils import build_pipeline_schedule
+#from daylight.profiling import maybe_enable_memory_snapshot, maybe_enable_profiling
+from daylight.utils import Color, NoColor, init_distributed
+from torch.distributed import destroy_process_group
 
 #from daylight.metrics import build_gpu_memory_monitor, build_metric_logger
 #from daylight.models import model_name_to_cls, model_name_to_tokenizer, models_config
@@ -36,14 +24,6 @@
 # ParallelDims,
 #)
 
-#from daylight.parallelisms.pipelining_utils import build_pipeline_schedule
-#from daylight.profiling import maybe_enable_memory_snapshot, maybe_enable_profiling
-from daylight.utils import (
-    Color,
-    init_distributed,
-    NoColor,
-    set_pg_timeouts,
-)
 
 def main(job_config: JobConfig):
     init_logger()
@@ -71,7 +51,7 @@ def main(job_config: JobConfig):
 
 
 if __name__ == "__main__":
-    print(f"Daylight starting...")
+    print("Daylight starting...")
     config = JobConfig()
     config.parse_args()
     main(config)
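
The print() change above is the kind of cleanup ruff makes for rule F541 (f-string without any placeholders): the f prefix does nothing when there is nothing to interpolate. A small illustration, not taken from the repository:

name = "Daylight"
print(f"Daylight starting...")  # no placeholders: flagged by ruff as F541
print("Daylight starting...")   # equivalent plain string, as in the diff
print(f"{name} starting...")    # the f prefix is warranted only when interpolating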

distributed/inference_configs/llama3_8B.toml

Lines changed: 6 additions & 2 deletions
@@ -23,15 +23,19 @@ flavor = "8B"
 tokenizer_path = "./test/assets/test_tiktoken.model"
 dtype = "bfloat16"
 
+[parallel]
+pipeline_parallel_degree = 1
+tensor_parallel_degree = 2
+
 [inference]
 batch_size = 8
 seq_len = 2048
 reps=1 # for profiling inference runs, can run repeatedly
 data_parallel_degree = -1
-tensor_parallel_degree = 1
+
 fp8_linear = ""
 compile = false
-pipeline_parallel_degree = 1
+
 enable_async_tensor_parallel=false
 pipeline_parallel_split_points= "layers.4" # string list of placements
 pipeline_parallel_schedule="gpipe" # TODO - what is best inference schedule for continous batching
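
A minimal sketch of how the new [parallel] table could be read. The real wiring lives in distributed/config_manager.py and is only partially visible in this commit (world_maker.py accesses it as config.parallel.tensor_parallel_degree), so the direct tomllib access below is an illustrative assumption:

import tomllib  # Python 3.11+; see the tomli fallback noted under config_manager.py

with open("distributed/inference_configs/llama3_8B.toml", "rb") as f:
    cfg = tomllib.load(f)

# values introduced by the new [parallel] section
tp = cfg["parallel"]["tensor_parallel_degree"]    # 2 in this config
pp = cfg["parallel"]["pipeline_parallel_degree"]  # 1 in this config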

distributed/logging_utils.py

Lines changed: 0 additions & 1 deletion
@@ -7,7 +7,6 @@
 import logging
 import os
 
-
 logger = logging.getLogger()
 
 
distributed/parallel_config.py

Lines changed: 4 additions & 1 deletion
@@ -4,9 +4,12 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from dataclasses import dataclass, field
+from dataclasses import dataclass
+
 from torch.distributed.device_mesh import init_device_mesh
 
+from distributed.logging_utils import logger
+
 
 @dataclass
 class ParallelDims:
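
The body of ParallelDims sits outside this hunk. As a rough sketch of how such a dataclass typically drives init_device_mesh (the tp/pp/world_size fields and the build_mesh(device_type=...) method are inferred from the world_maker.py hunk below; everything else is an assumption):

from dataclasses import dataclass

from torch.distributed.device_mesh import init_device_mesh


@dataclass
class ParallelDimsSketch:  # hypothetical stand-in, not the class defined in this repo
    tp: int
    pp: int
    world_size: int

    def build_mesh(self, device_type: str):
        # Keep only the dimensions that are actually parallelized; the real
        # implementation may also handle data parallelism and validation.
        active = [(self.pp, "pp"), (self.tp, "tp")]
        shape = tuple(d for d, _ in active if d > 1)
        names = tuple(n for d, n in active if d > 1)
        return init_device_mesh(device_type, shape, mesh_dim_names=names)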

distributed/parallelize_llama.py

Lines changed: 5 additions & 10 deletions
@@ -4,19 +4,14 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-from typing import Tuple
-from torch.distributed.tensor.parallel import (
-    ColwiseParallel,
-    parallelize_module,
-    PrepareModuleInput,
-    RowwiseParallel,
-)
-
 import torch.nn as nn
-from torch.distributed._tensor import Replicate, Shard
-from distributed.parallel_config import ParallelDims
 from torch.distributed.device_mesh import DeviceMesh
+from torch.distributed.tensor.parallel import (ColwiseParallel,
+                                               RowwiseParallel,
+                                               parallelize_module)
+
 from distributed.logging_utils import logger
+from distributed.parallel_config import ParallelDims
 
 
 def apply_tp(

distributed/utils.py

Lines changed: 3 additions & 1 deletion
@@ -5,12 +5,14 @@
 # LICENSE file in the root directory of this source tree.
 
 import os
+from dataclasses import dataclass
 from datetime import timedelta
 
 import torch
-from dataclasses import dataclass, field
+
 from distributed.logging_utils import logger
 
+
 def _warn_overwrite_env(env, val):
     if env in os.environ:
         logger.warning(

distributed/world_maker.py

Lines changed: 11 additions & 25 deletions
@@ -4,33 +4,16 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import contextlib
 import os
-import time
+from typing import Optional, Tuple
 
-from dataclasses import dataclass, field
-from datetime import timedelta
-from io import BytesIO
-from timeit import default_timer as timer
-from typing import Any, Dict, List, Tuple, Optional
-
-import numpy as np
-
-import torch
-import torch.nn.functional as F
-from torch.distributed import destroy_process_group
-from torch.distributed.checkpoint.stateful import Stateful
-from torch.distributed.elastic.multiprocessing.errors import record
-from torch.distributed.tensor.parallel import loss_parallel
-import torch.nn as nn
-from torch.distributed._tensor import Replicate, Shard
-from distributed.parallel_config import ParallelDims
 from torch.distributed.device_mesh import DeviceMesh
 
+from distributed.logging_utils import logger
+from distributed.parallel_config import ParallelDims
+from distributed.utils import init_distributed
 
 from .config_manager import InferenceConfig
-from distributed.logging_utils import init_logger, logger
-
 
 
 def launch_distributed(
@@ -57,13 +40,16 @@ def launch_distributed(
 
 
     logger.info(f"toml parsing completed. Launching with {world_size} GPUs")
-
+    # review parallel config
+    tp = config.parallel.tensor_parallel_degree
+    pp = config.parallel.pipeline_parallel_degree
 
     parallel_dims = ParallelDims(
-        tp=8,
-        pp=1,
+        tp=tp,
+        pp=pp,
         world_size=world_size,
     )
     init_distributed()
     world_mesh = parallel_dims.build_mesh(device_type="cuda")
-    assert False, "--- function end"
+    logger.info(f"world_mesh created: {world_mesh}")
+    return world_mesh, parallel_dims
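
With the trailing assert removed, launch_distributed now returns the built mesh and the parallel dims. A hedged usage sketch follows; the function's full argument list is not visible in this diff, so passing the parsed config is an assumption:

from distributed.logging_utils import logger
from distributed.world_maker import launch_distributed

# `config` is assumed to be the parsed InferenceConfig carrying the new [parallel] table
world_mesh, parallel_dims = launch_distributed(config)
logger.info(f"mesh: {world_mesh}, tp={parallel_dims.tp}, pp={parallel_dims.pp}")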
