
Commit bba4153

backend changes, addressing review comments
1 parent 8319c29 commit bba4153

File tree

9 files changed: +72 −23 lines


examples/distributed_inference/README.md

Lines changed: 18 additions & 0 deletions

@@ -18,15 +18,33 @@ torchrun --nproc_per_node=2 tensor_parallel_llama2.py
 3. Tensor parallel distributed inference using nccl ops plugin

 apt install libmpich-dev
+
 apt install libopenmpi-dev
+
+#For python3.10
+
 pip install tensorrt-llm
+
+For other python versions, you need to load libnvinfer_plugin_tensorrt_llm.so manually. Set its path in the environment variable trtllm_env (export trtllm_env={lib_path}); for example, initialize_distributed_env() already sets this variable. Note that setting it from within the running example will not work, since it must be preset for the converter library to pick it up.
+
 #then pip install the tensorrt and torch version compatible with installed torchTRT
+
 mpirun -n 2 --allow-run-as-root python tensor_parallel_simple_example.py

+#For other python
+
 4. Tensor parallel distributed llama3 inference using nccl ops plugin

 apt install libmpich-dev
+
 apt install libopenmpi-dev
+
+#For python3.10
+
 pip install tensorrt-llm
+
+For other python versions, you need to load libnvinfer_plugin_tensorrt_llm.so
+
 #then pip install the tensorrt and torch version compatible with installed torchTRT
+
 mpirun -n 2 --allow-run-as-root python tensor_parallel_llama3.py
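A minimal sketch of the preloading step the README describes, assuming only that trtllm_env holds the path to libnvinfer_plugin_tensorrt_llm.so (the surrounding script is illustrative, not part of this commit):

    import ctypes
    import os

    # Must be set before the converter library loads, e.g.
    #   export trtllm_env=/path/to/libnvinfer_plugin_tensorrt_llm.so
    plugin_lib = os.environ.get("trtllm_env")
    if plugin_lib is None:
        raise RuntimeError("trtllm_env is not set to the plugin .so path")

    # Load the TensorRT-LLM plugin library into the process so its plugin
    # creators (e.g. AllGather) are visible to TensorRT.
    ctypes.CDLL(plugin_lib)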

Lines changed: 0 additions & 1 deletion

@@ -1,5 +1,4 @@
 accelerate
 transformers
 diffusers
-site
 tensorrt-llm

examples/distributed_inference/tensor_parallel_llama3.py

Lines changed: 1 addition & 0 deletions

@@ -48,6 +48,7 @@
         "use_python_runtime": True,
         "workspace_size": 1 << 33,
         "debug": False,
+        "use_aot_joint_export": False,
     },
     dynamic=False,
 )

examples/distributed_inference/tensor_parallel_nccl_ops.py

Lines changed: 20 additions & 9 deletions

@@ -19,8 +19,8 @@
     dynamo_tensorrt_converter,
 )
 from torch_tensorrt.dynamo.lowering.passes.fuse_distributed_ops import (
-    custom_fused_all_gather_op,
-    custom_fused_reduce_scatter_op,
+    tensorrt_fused_nccl_all_gather_op,
+    tensorrt_fused_nccl_reduce_scatter_op,
 )
 from torch_tensorrt.dynamo.types import TRTTensor
 from torch_tensorrt.fx.converters.converter_utils import set_layer_name
@@ -105,7 +105,7 @@ def register_nccl_ops(logger_file_name):
         f"Plugin Name: {plugin_creator.name}, Namespace: {plugin_creator.plugin_namespace}, Version: {plugin_creator.plugin_version}"
     )

-    @dynamo_tensorrt_converter(custom_fused_all_gather_op)
+    @dynamo_tensorrt_converter(tensorrt_fused_nccl_all_gather_op)
     def insert_nccl_gather_op(
         ctx: ConversionContext,
         target: Target,
@@ -118,12 +118,18 @@ def insert_nccl_gather_op(
             "AllGather", "1", "tensorrt_llm"
         )
         assert allgather_plg_creator is not None
-        _world_size = int(os.environ["WORLD_SIZE"])
+        _world_size = os.environ.get("WORLD_SIZE")
+        if _world_size is not None:
+            _world_size = int(_world_size)
+        else:
+            raise RuntimeError(
+                "The WORLD_SIZE env variable is not set in distributed environment"
+            )
         group = list(range(_world_size))
         group = trt.PluginField(
             "group", np.array(group, dtype=np.int32), trt.PluginFieldType.INT32
         )
-        p_dtype = trt.float16
+        p_dtype = trt.float32
         pf_type = trt.PluginField(
             "type_id", np.array([int(p_dtype)], np.int32), trt.PluginFieldType.INT32
         )
@@ -133,7 +139,7 @@ def insert_nccl_gather_op(
         set_layer_name(layer, target, name)
         return layer.get_output(0)

-    @dynamo_tensorrt_converter(custom_fused_reduce_scatter_op)
+    @dynamo_tensorrt_converter(tensorrt_fused_nccl_reduce_scatter_op)
     def insert_nccl_reduce_scatter_plugin(
         ctx: ConversionContext,
         target: Target,
@@ -151,9 +157,14 @@ def insert_nccl_reduce_scatter_plugin(
         counter = 0
         strategy = AllReduceStrategy.NCCL
         config = AllReduceConfig(0)
-
-        world_size = dist.get_world_size()
-        group = list(range(world_size))
+        _world_size = os.environ.get("WORLD_SIZE")
+        if _world_size is not None:
+            _world_size = int(_world_size)
+        else:
+            raise RuntimeError(
+                "The WORLD_SIZE env variable is not set in distributed environment"
+            )
+        group = list(range(_world_size))
         group = trt.PluginField(
             "group", np.array(group, dtype=np.int32), trt.PluginFieldType.INT32
         )
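The WORLD_SIZE lookup above is now duplicated in both converters; a hypothetical helper (not in this commit) could factor it out:

    import os

    def _require_world_size() -> int:
        """Return WORLD_SIZE as an int, failing loudly when it is unset."""
        world_size = os.environ.get("WORLD_SIZE")
        if world_size is None:
            raise RuntimeError(
                "The WORLD_SIZE env variable is not set in distributed environment"
            )
        return int(world_size)

insert_nccl_gather_op and insert_nccl_reduce_scatter_plugin could then both call _require_world_size() instead of repeating the check.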

examples/distributed_inference/tensor_parallel_simple_example.py

Lines changed: 1 addition & 0 deletions

@@ -75,6 +75,7 @@ def forward(self, x):
         "enabled_precisions": {torch.float32, torch.float16},
         "use_python_runtime": True,
         "min_block_size": 1,
+        "use_aot_joint_export": False,
     },
     dynamic=False,
 )
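For context, a sketch of how such an options dict reaches the backend through torch.compile; the model and inputs are stand-ins mirroring the example above, not part of this diff:

    import torch

    compiled = torch.compile(
        model,  # hypothetical tensor-parallel module
        backend="torch_tensorrt",
        options={
            "use_python_runtime": True,
            "min_block_size": 1,
            # Skip aot_export_joint_simple so the backend is wrapped with
            # aot_autograd, as required for distributed tensors.
            "use_aot_joint_export": False,
        },
        dynamic=False,
    )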

py/torch_tensorrt/dynamo/_defaults.py

Lines changed: 1 addition & 0 deletions

@@ -46,6 +46,7 @@
 IMMUTABLE_WEIGHTS = True
 ENABLE_WEIGHT_STREAMING = False
 ENABLE_CROSS_COMPILE_FOR_WINDOWS = False
+USE_AOT_JOINT_EXPORT = True


 def default_device() -> Device:

py/torch_tensorrt/dynamo/_settings.py

Lines changed: 3 additions & 0 deletions

@@ -33,6 +33,7 @@
     STRIP_ENGINE_WEIGHTS,
     TIMING_CACHE_PATH,
     TRUNCATE_DOUBLE,
+    USE_AOT_JOINT_EXPORT,
     USE_EXPLICIT_TYPING,
     USE_FAST_PARTITIONER,
     USE_FP32_ACC,
@@ -91,6 +92,7 @@ class CompilationSettings:
         enable_weight_streaming (bool): Enable weight streaming.
         enable_cross_compile_for_windows (bool): By default this is False means TensorRT engines can only be executed on the same platform where they were built.
             True will enable cross-platform compatibility which allows the engine to be built on Linux and run on Windows
+        use_aot_joint_export (bool): Use aot_export_joint_simple to export the graph; otherwise wrap the backend with aot_autograd, which is required for distributed tensors
     """

     enabled_precisions: Set[dtype] = field(default_factory=lambda: ENABLED_PRECISIONS)
@@ -131,6 +133,7 @@ class CompilationSettings:
     immutable_weights: bool = IMMUTABLE_WEIGHTS
     enable_weight_streaming: bool = ENABLE_WEIGHT_STREAMING
     enable_cross_compile_for_windows: bool = ENABLE_CROSS_COMPILE_FOR_WINDOWS
+    use_aot_joint_export: bool = USE_AOT_JOINT_EXPORT

_SETTINGS_TO_BE_ENGINE_INVARIANT = (
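A quick sketch of the resulting dataclass behavior (illustrative only):

    from torch_tensorrt.dynamo import CompilationSettings

    # Defaults to USE_AOT_JOINT_EXPORT (True); override it per compilation.
    settings = CompilationSettings(use_aot_joint_export=False)
    assert settings.use_aot_joint_export is False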

py/torch_tensorrt/dynamo/backend/backends.py

Lines changed: 24 additions & 9 deletions

@@ -1,11 +1,13 @@
 from __future__ import annotations

+import functools
 import logging
 import unittest
 from typing import Any, Callable, Sequence

 import torch
 import torch._dynamo as td
+from torch._dynamo.backends.common import aot_autograd
 from torch._dynamo.utils import detect_fake_mode
 from torch._functorch.aot_autograd import aot_export_joint_simple
 from torch_tensorrt.dynamo import CompilationSettings
@@ -49,7 +51,19 @@ def aot_torch_tensorrt_aten_backend(
     gm: torch.fx.GraphModule, sample_inputs: Sequence[Any], **kwargs: Any
 ) -> torch.nn.Module:
     settings, engine_cache = parse_dynamo_kwargs(kwargs)
-    return _pretraced_backend(gm, sample_inputs, settings, engine_cache)
+    if settings.use_aot_joint_export:
+        return _pretraced_backend(gm, sample_inputs, settings, engine_cache)
+    logger.debug("Wrapping the backend with aot_autograd\n")
+    _pretraced_backend_autograd = functools.partial(
+        _pretraced_backend, settings=settings, engine_cache=engine_cache
+    )
+    settings_aot_autograd = {}
+    settings_aot_autograd["decompositions"] = get_decompositions(
+        settings.enable_experimental_decompositions
+    )
+    return aot_autograd(
+        fw_compiler=_pretraced_backend_autograd,
+        decompositions=settings_aot_autograd["decompositions"],
+    )(gm, sample_inputs)


 def _pretraced_backend(
@@ -90,14 +104,15 @@ def _pretraced_backend(
     remove_detach(gm, settings)

     # Invoke AOTAutograd to translate operators to aten
-    gm = aot_export_joint_simple(
-        gm,
-        sample_inputs,
-        trace_joint=False,
-        decompositions=get_decompositions(
-            settings.enable_experimental_decompositions
-        ),
-    )
+    if settings.use_aot_joint_export:
+        gm = aot_export_joint_simple(
+            gm,
+            sample_inputs,
+            trace_joint=False,
+            decompositions=get_decompositions(
+                settings.enable_experimental_decompositions
+            ),
+        )

     logger.debug("Post-AOT Autograd graph:\n" + str(gm.graph))

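A conceptual sketch of the aot_autograd wrapping used above: aot_autograd turns a forward-compiler function into a dynamo backend that runs AOTAutograd, applying the given decompositions, before handing the lowered aten graph to fw_compiler (the toy compiler below is a stand-in for _pretraced_backend):

    from torch._dynamo.backends.common import aot_autograd

    def toy_fw_compiler(gm, sample_inputs):
        # A real compiler would lower gm to TensorRT here; returning
        # gm.forward leaves execution in eager mode.
        return gm.forward

    backend = aot_autograd(fw_compiler=toy_fw_compiler, decompositions={})
    # backend(gm, sample_inputs) is usable wherever a dynamo backend is expected.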
py/torch_tensorrt/dynamo/lowering/passes/fuse_distributed_ops.py

Lines changed: 4 additions & 4 deletions

@@ -12,13 +12,13 @@
 logger = logging.getLogger(__name__)


-def custom_fused_all_gather_op(args0, args1, args2):
+def tensorrt_fused_nccl_all_gather_op(args0, args1, args2):
     return torch.ops._c10d_functional.wait_tensor.default(
         torch.ops._c10d_functional.all_gather_into_tensor.default(args0, args1, args2)
     )


-def custom_fused_reduce_scatter_op(args0, args1, args2, args3):
+def tensorrt_fused_nccl_reduce_scatter_op(args0, args1, args2, args3):
     return torch.ops._c10d_functional.wait_tensor.default(
         torch.ops._c10d_functional.reduce_scatter_tensor.default(
             args0, args1, args2, args3
@@ -44,10 +44,10 @@ def fuse_distributed_ops(
         wait_tensor_node = list(node.users)[0]
         fused_op = None
         if node.target == torch.ops._c10d_functional.all_gather_into_tensor.default:
-            fused_op = custom_fused_all_gather_op
+            fused_op = tensorrt_fused_nccl_all_gather_op
             fused_op_args = (node.args[0], node.args[1], node.args[2])
         else:
-            fused_op = custom_fused_reduce_scatter_op
+            fused_op = tensorrt_fused_nccl_reduce_scatter_op
             fused_op_args = (node.args[0], node.args[1], node.args[2], node.args[3])
         with gm.graph.inserting_after(wait_tensor_node):
             fused_node = gm.graph.create_node(
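Schematically, the pass collapses the two-node collective pattern into a single call node (argument names are illustrative):

    # Before fusion, as produced by the dynamo trace:
    #   y = torch.ops._c10d_functional.all_gather_into_tensor.default(x, group_size, group_name)
    #   out = torch.ops._c10d_functional.wait_tensor.default(y)
    #
    # After fusion, a single node the NCCL converter above can match:
    #   out = tensorrt_fused_nccl_all_gather_op(x, group_size, group_name)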
