
Commit be7db92

better toml breakout, add tomli if python < 3.11
1 parent 50e697a commit be7db92

File tree: 4 files changed, +3 −39 lines

distributed/config_manager.py
distributed/inference_configs/llama3_8B.toml
distributed/utils.py
requirements.txt

distributed/config_manager.py

Lines changed: 0 additions & 7 deletions
@@ -82,17 +82,10 @@ def parse_args(self, config_file):
             logger.exception(f"Error details: {str(e)}")
             raise e
 
-        # override args dict with cmd_args
-        # cmd_args_dict = self._args_to_two_level_dict(cmd_args)
-        # for section, section_args in cmd_args_dict.items():
-        #     for k, v in section_args.items():
-        #         args_dict[section][k] = v
-
         for k, v in args_dict.items():
             class_type = type(k.title(), (), v)
             setattr(self, k, class_type())
 
-        #self._validate_config()
 
     def _args_to_two_level_dict(self, args: argparse.Namespace) -> defaultdict:
         args_dict = defaultdict(defaultdict)
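
For context, the loop that remains is what produces the "toml breakout": each top-level TOML table becomes an attribute backed by a dynamically created class. A minimal, self-contained sketch of that pattern (the JobConfig name and the sample dict are illustrative, not taken from this repo):

# Sketch of the section-to-attribute pattern kept in parse_args above.
# The dict below stands in for a parsed TOML file; names are illustrative.
args_dict = {
    "parallel": {"pipeline_parallel_degree": 1, "tensor_parallel_degree": 2},
    "inference": {"batch_size": 8, "seq_len": 2048},
}

class JobConfig:
    def __init__(self, args_dict: dict):
        for section, values in args_dict.items():
            # "parallel" -> a class named "Parallel" whose class attributes
            # are the section's key/value pairs
            class_type = type(section.title(), (), values)
            setattr(self, section, class_type())

config = JobConfig(args_dict)
print(config.parallel.tensor_parallel_degree)  # 2
print(config.inference.seq_len)                # 2048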

distributed/inference_configs/llama3_8B.toml

Lines changed: 2 additions & 3 deletions
@@ -26,17 +26,16 @@ dtype = "bfloat16"
 [parallel]
 pipeline_parallel_degree = 1
 tensor_parallel_degree = 2
+enable_async_tensor_parallel=false
 
 [inference]
 batch_size = 8
 seq_len = 2048
 reps=1 # for profiling inference runs, can run repeatedly
-data_parallel_degree = -1
-
 fp8_linear = ""
 compile = false
 
-enable_async_tensor_parallel=false
+[pipelining]
 pipeline_parallel_split_points= "layers.4" # string list of placements
 pipeline_parallel_schedule="gpipe" # TODO - what is best inference schedule for continous batching
 pipeline_parallel_split_mode = "manual"
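
Note that pipeline_parallel_split_points in the new [pipelining] table is still a single comma-separated string rather than a TOML array. A hedged sketch of how a consumer might turn it into a list (the parsing below is an assumption for illustration, not part of this diff):

# Illustrative only: split the comma-separated placement string into a list.
raw = "layers.4"  # value of pipeline_parallel_split_points above; could be e.g. "layers.4, layers.8"
split_points = [p.strip() for p in raw.split(",") if p.strip()]
print(split_points)  # ['layers.4']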

distributed/utils.py

Lines changed: 0 additions & 29 deletions
@@ -12,42 +12,13 @@
 
 from distributed.logging_utils import logger
 
-
 def _warn_overwrite_env(env, val):
     if env in os.environ:
         logger.warning(
             f"ENV[{env}] = {os.environ[env]} will be overridden to {val} based on job config"
         )
     os.environ[env] = val
 
-
-def set_pg_timeouts(timeout, world_mesh):
-    """
-    Sets the timeout for all PGs in the provided mesh, and the default (world) group.
-
-    Note: synchronizes via a barrier, before changing the timeouts. This is important, becuase
-    otherwise you may face a race where the slow rank has not reached the timeout reduction point
-    yet due to slow operations permitted under the old timeout value, but other faster ranks may
-    start issueing collectives under the new shorter timeout and then immediately timeout.
-    """
-    logger.info(
-        f"Synchronizing and adjusting timeout for all ProcessGroups to {timeout}"
-    )
-    # Ensure that all the ranks have reached the point of setting the new timeout-
-    # otherwise, some ranks may issue collectives with the new/shorter timeout and
-    # those may time out, before other ranks have finished with initialization done
-    # under the old/slow timeout.
-    torch.distributed.barrier()
-    torch.cuda.synchronize()
-
-    groups = [world_mesh.get_group(mesh_dim) for mesh_dim in range(world_mesh.ndim)]
-
-    # None represents the 'default' PG, not part of the mesh
-    groups.append(None)
-    for group in groups:
-        torch.distributed.distributed_c10d._set_pg_timeout(timeout, group)
-
-
 TRACE_BUFFER_SIZE = "TORCH_NCCL_TRACE_BUFFER_SIZE"
 TRACE_FILE = "TORCH_NCCL_DEBUG_INFO_TEMP_FILE"
 DUMP_ON_TIMEOUT = "TORCH_NCCL_DUMP_ON_TIMEOUT"
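
What survives in utils.py is the env-var helper plus the TORCH_NCCL_* constants it is used with for NCCL flight-recorder settings. A standalone usage sketch (the logger setup and the values passed in are assumptions, not defaults from this repo):

import logging
import os

logger = logging.getLogger(__name__)

# Constants kept in distributed/utils.py above
TRACE_BUFFER_SIZE = "TORCH_NCCL_TRACE_BUFFER_SIZE"
DUMP_ON_TIMEOUT = "TORCH_NCCL_DUMP_ON_TIMEOUT"

def _warn_overwrite_env(env, val):
    # Same shape as the helper retained above: warn if already set, then overwrite.
    if env in os.environ:
        logger.warning(
            f"ENV[{env}] = {os.environ[env]} will be overridden to {val} based on job config"
        )
    os.environ[env] = val

# Illustrative values; the job config decides what actually gets set.
_warn_overwrite_env(TRACE_BUFFER_SIZE, "2000")
_warn_overwrite_env(DUMP_ON_TIMEOUT, "1")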

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ numpy < 2.0
 gguf
 lm-eval==0.4.2
 blobfile
+tomli >= 1.1.0 ; python_version < "3.11"
 
 # Build tools
 wheel
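
The new requirement line mirrors the usual import fallback on the Python side: tomllib is in the standard library from Python 3.11, and tomli exposes the same API for older interpreters. A minimal consumption sketch (the wrapping code is an assumption; only the package names and the config path come from this diff):

import sys

if sys.version_info >= (3, 11):
    import tomllib  # stdlib TOML parser, Python 3.11+
else:
    import tomli as tomllib  # backport with the same API, hence the new requirement

# tomllib/tomli require a binary file handle
with open("distributed/inference_configs/llama3_8B.toml", "rb") as f:
    args_dict = tomllib.load(f)

print(sorted(args_dict))  # e.g. ['inference', 'parallel', 'pipelining', ...]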
