
Commit ddf72cd

Implement the Option 1 warning, recommending that users set use_distributed_mode_trace=True
1 parent 40df0e2 commit ddf72cd

File tree

examples/distributed_inference/tensor_parallel_simple_example.py
tests/py/dynamo/distributed/test_distributed_simple_example.py
tests/py/dynamo/distributed/test_nccl_ops.sh

3 files changed: +70 -31 lines changed
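The change in both Python files is the same: instead of hard-coding a distributed-trace option at the call site, the example first tries a plain torch.compile and falls back when the backend reports that aot_export cannot handle traceable tensor subclasses. A minimal, self-contained sketch of that retry pattern follows; the compile_fn callable and compile_with_fallback helper are hypothetical stand-ins for illustration, not part of the commit.

    import logging

    logger = logging.getLogger(__name__)

    AOT_EXPORT_ERROR = (
        "aot_export is not currently supported with traceable tensor subclass"
    )

    def compile_with_fallback(compile_fn, model, options):
        """Try compiling once; on the known aot_export error, retry with
        use_distributed_mode_trace=True."""
        try:
            return compile_fn(model, options)
        except RuntimeError as e:
            if AOT_EXPORT_ERROR not in str(e):
                raise
            logger.warning(
                "It is recommended to run the model with use_distributed_mode_trace=True. "
                "Retrying with that option."
            )
            # Leave the caller's options untouched; retry with a copy.
            retry_options = dict(options, use_distributed_mode_trace=True)
            return compile_fn(model, retry_options)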

examples/distributed_inference/tensor_parallel_simple_example.py

Lines changed: 30 additions & 14 deletions
@@ -21,6 +21,35 @@
 """
 
 
+def compile_tp_model(tp_model, backend):
+    compile_options = {
+        "truncate_long_and_double": True,
+        "enabled_precisions": {torch.float32, torch.float16},
+        "use_python_runtime": True,
+        "min_block_size": 1,
+    }
+
+    try:
+        return torch.compile(
+            tp_model, backend=backend, options=compile_options, dynamic=False
+        )
+    except RuntimeError as e:
+        if (
+            "aot_export is not currently supported with traceable tensor subclass"
+            in str(e)
+        ):
+            logger.warning(
+                "It is recommended to run the model with use_distributed_mode_trace=True. Retrying with that option."
+            )
+            compile_options["use_distributed_mode_trace"] = True
+            return torch.compile(
+                tp_model, backend=backend, options=compile_options, dynamic=False
+            )
+        else:
+            logger.debug("The distributed model failed with the following error")
+            raise
+
+
 class ToyModel(nn.Module):
     """MLP based model"""
 
@@ -64,20 +93,7 @@ def forward(self, x):
 inp = torch.rand(20, 10, device="cuda")
 python_result = tp_model(inp)
 
-
-backend = "torch_tensorrt"
-tp_model = torch.compile(
-    tp_model,
-    backend=backend,
-    options={
-        "truncate_long_and_double": True,
-        "enabled_precisions": {torch.float32, torch.float16},
-        "use_python_runtime": True,
-        "min_block_size": 1,
-        "use_distributed_mode_trace": True,
-    },
-    dynamic=False,
-)
+compile_tp_model(tp_model, backend="torch_tensorrt")
 
 for i in range(10):
     # For TP, input needs to be same across all TP ranks.
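To see the new helper in isolation, here is a hedged usage sketch. It assumes torch_tensorrt is installed and imported so the "torch_tensorrt" backend is registered, and that a CUDA device is available; the plain nn.Linear stand-in is not part of the example, which compiles the sharded ToyModel instead.

    import torch
    import torch.nn as nn

    # Hypothetical stand-in module for demonstration purposes.
    model = nn.Linear(10, 10).to("cuda")
    compiled = compile_tp_model(model, backend="torch_tensorrt")
    out = compiled(torch.rand(4, 10, device="cuda"))  # first call triggers compilation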

tests/py/dynamo/distributed/test_distributed_simple_example.py

Lines changed: 31 additions & 14 deletions
@@ -16,6 +16,36 @@
     "./tensor_parallel_simple_example"
 )
 
+
+def compile_tp_model(tp_model, backend):
+    compile_options = {
+        "truncate_long_and_double": True,
+        "enabled_precisions": {torch.float32, torch.float16},
+        "use_python_runtime": True,
+        "min_block_size": 1,
+    }
+
+    try:
+        return torch.compile(
+            tp_model, backend=backend, options=compile_options, dynamic=False
+        )
+    except RuntimeError as e:
+        if (
+            "aot_export is not currently supported with traceable tensor subclass"
+            in str(e)
+        ):
+            logger.warning(
+                "It is recommended to run the model with use_distributed_mode_trace=True. Retrying with that option."
+            )
+            compile_options["use_distributed_mode_trace"] = True
+            return torch.compile(
+                tp_model, backend=backend, options=compile_options, dynamic=False
+            )
+        else:
+            logger.debug("The distributed model failed with the following error")
+            raise
+
+
 """
 This example copies some code from https://github.com/pytorch/examples/blob/main/distributed/tensor_parallelism/tensor_parallel_example.py
 """
@@ -60,20 +90,7 @@ def forward(self, x):
 inp = torch.rand(20, 10, device="cuda")
 python_result = tp_model(inp)
 
-
-backend = "torch_tensorrt"
-tp_model = torch.compile(
-    tp_model,
-    backend=backend,
-    options={
-        "truncate_long_and_double": True,
-        "enabled_precisions": {torch.float32, torch.float16},
-        "use_python_runtime": True,
-        "min_block_size": 1,
-        "use_aot_joint_export": False,
-    },
-    dynamic=False,
-)
+compile_tp_model(tp_model, backend="torch_tensorrt")
 
 for i in range(10):
     # For TP, input needs to be same across all TP ranks.
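The test script exercises the same helper against the eager result it computes earlier. One way such a test might assert eager/compiled parity is sketched below; the assert_close call and its tolerances are assumptions for illustration, not taken from this commit.

    import torch

    # Continuing from the test's setup: tp_model, inp, and python_result
    # are defined earlier in the script.
    compiled_model = compile_tp_model(tp_model, backend="torch_tensorrt")
    trt_result = compiled_model(inp)
    torch.testing.assert_close(python_result, trt_result, rtol=1e-2, atol=1e-2)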

tests/py/dynamo/distributed/test_nccl_ops.sh

Lines changed: 9 additions & 3 deletions
@@ -88,11 +88,17 @@ fi
 URL="https://pypi.nvidia.com/tensorrt-llm/$FILE"
 echo "Downloading $FILE from $URL..."
 
-echo "Downloading ...."
+echo "Downloading here...."
 #Installing wget
 ensure_installed wget
-#Downloading the package
-wget "$URL"
+
+#Downloading the file
+filename=$(basename "$URL")
+if [ -f "$filename" ]; then
+    echo "File already exists: $filename"
+else
+    wget "$URL"
+fi
 echo "Download complete: $FILE"
 
 UNZIP_DIR="tensorrt_llm_unzip"
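The shell change makes the download idempotent: wget only runs when the file is not already on disk, so rerunning the test script does not re-fetch the wheel. For comparison, the same guard sketched in Python; download_once is a hypothetical helper, not part of the commit.

    from pathlib import Path
    from urllib.request import urlretrieve

    def download_once(url: str, dest_dir: str = ".") -> Path:
        # Mirror the script's basename "$URL": the local name is the last URL segment.
        target = Path(dest_dir) / url.rsplit("/", 1)[-1]
        if target.exists():
            print(f"File already exists: {target}")
        else:
            urlretrieve(url, target)
        return target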
