chore: updates

peri044 · cehongwang · commit a015538e1f84 · 2025-02-13T00:23:28.000Z
diff --git a/examples/dynamo/flux.py b/examples/dynamo/flux.py
@@ -180,7 +180,9 @@ def forward(
                         use_fp32_acc=use_fp32_acc,
                     )
                     trt_end = time.time()
+                    config = pipe.transformer.config
                     pipe.transformer = trt_model
+                    pipe.transformer.config = config
 
                     free, total = torch.cuda.mem_get_info(cuda_device)
                     print(
diff --git a/examples/dynamo/torch_export_flux_fp8.py b/examples/dynamo/torch_export_flux_fp8.py
@@ -0,0 +1,83 @@
+import modelopt.torch.opt as mto
+import modelopt.torch.quantization as mtq
+import torch
+import torch_tensorrt
+from diffusers import FluxPipeline
+from modelopt.torch.quantization.utils import export_torch_mode
+
+# from onnx_utils.export import generate_dummy_inputs
+from torch.export._trace import _export
+
+
+def generate_image(pipe, prompt, image_name):
+    """Render `prompt` with `pipe` using a fixed seed and save it as `<image_name>.png`."""
+    rng = torch.Generator("cuda").manual_seed(42)
+    result = pipe(
+        prompt, output_type="pil", num_inference_steps=20, generator=rng
+    )
+    first_image = result.images[0]
+    output_path = f"{image_name}.png"
+    first_image.save(output_path)
+    print(f"Image generated using {image_name} model saved as {image_name}.png")
+
+
+device = "cuda"
+pipe = FluxPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-dev",
+    torch_dtype=torch.float16,
+)
+
+pipe.to(device)
+backbone = pipe.transformer
+
+# Restore FP8 weights (NOTE(review): checkpoint is named "schnell" but the pipeline is FLUX.1-dev; confirm)
+mto.restore(backbone, "./schnell_fp8.pt")
+
+# Dynamic-shape spec and example inputs for the transformer, both in positional order.
+batch_size = 1
+BATCH = torch.export.Dim("batch", min=1, max=2)
+SEQ_LEN = torch.export.Dim("seq_len", min=1, max=256)
+dynamic_shapes = (
+    {0: BATCH},
+    {0: BATCH, 1: SEQ_LEN},
+    {0: BATCH},
+    {0: BATCH},
+    {0: BATCH},
+    {0: BATCH, 1: SEQ_LEN},
+)
+# Dim 0 of every example input must agree, since all of them are tied to BATCH.
+dummy_inputs = (
+    torch.randn((batch_size, 4096, 64), dtype=torch.float16).to(device),
+    torch.randn((batch_size, 256, 4096), dtype=torch.float16).to(device),
+    torch.randn((batch_size, 768), dtype=torch.float16).to(device),
+    torch.tensor([1.0] * batch_size, dtype=torch.float16).to(device),
+    torch.randn((batch_size, 4096, 3), dtype=torch.float16).to(device),
+    torch.randn((batch_size, 256, 3), dtype=torch.float16).to(device),
+)
+with export_torch_mode():
+    ep = _export(
+        backbone,
+        dummy_inputs,
+        dynamic_shapes=dynamic_shapes,
+        strict=False,
+        allow_complex_guards_as_runtime_asserts=True,
+    )
+
+with torch_tensorrt.logging.debug():
+    trt_gm = torch_tensorrt.dynamo.compile(
+        ep,
+        inputs=dummy_inputs,
+        enabled_precisions={torch.float8_e4m3fn, torch.float16},
+        truncate_double=True,
+        dryrun=True,
+        debug=True,
+    )
+
+
+# Swap the compiled module into the pipeline and re-attach the original transformer config.
+backbone.to("cpu")
+config = pipe.transformer.config
+pipe.transformer = trt_gm
+pipe.transformer.config = config
+
+# Generate an image
+generate_image(pipe, "A cat holding a sign that says hello world", "flux-dev")
diff --git a/py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py b/py/torch_tensorrt/dynamo/lowering/passes/remove_assert_scalar.py
@@ -15,7 +15,10 @@ def remove_assert_scalar(
     """Remove assert_scalar ops in the graph"""
     count = 0
     for node in gm.graph.nodes:
-        if node.target == torch.ops.aten._assert_scalar.default:
+        if (
+            node.target == torch.ops.aten._assert_scalar.default
+            or node.target == torch.ops.aten._assert_tensor_metadata.default
+        ):
             gm.graph.erase_node(node)
             count += 1
 
diff --git a/py/torch_tensorrt/dynamo/utils.py b/py/torch_tensorrt/dynamo/utils.py
@@ -246,9 +246,10 @@ def prepare_inputs(
     if isinstance(inputs, Input):
         return inputs
 
-    elif isinstance(inputs, torch.Tensor):
+    elif isinstance(inputs, (torch.Tensor, int, float, bool)):
         return Input.from_tensor(
-            inputs, disable_memory_format_check=disable_memory_format_check
+            inputs if isinstance(inputs, torch.Tensor) else torch.tensor(inputs),
+            disable_memory_format_check=disable_memory_format_check,
         )
 
     elif isinstance(inputs, (list, tuple)):
@@ -395,8 +396,11 @@ def unwrap_tensor_dtype(tensor: Union[torch.Tensor, FakeTensor, torch.SymInt]) -
     """
     Returns the dtype of torch.tensor or FakeTensor. For symbolic integers, we return int64
     """
     if isinstance(tensor, (torch.Tensor, FakeTensor)):
         return tensor.dtype
+    elif isinstance(tensor, (int, float, bool)):
+        # Python scalars carry no .dtype; use the dtype torch.tensor() infers for them.
+        return torch.tensor(tensor).dtype
     elif isinstance(tensor, torch.SymInt):
         return torch.int64
     else:

Original file line number	Diff line number	Diff line change
`@@ -180,7 +180,9 @@ def forward(`
`180`	`180`	`use_fp32_acc=use_fp32_acc,`
`181`	`181`	`)`
`182`	`182`	`trt_end = time.time()`
	`183`	`+ config = pipe.transformer.config`
`183`	`184`	`pipe.transformer = trt_model`
	`185`	`+ pipe.transformer.config = config`
`184`	`186`
`185`	`187`	`free, total = torch.cuda.mem_get_info(cuda_device)`
`186`	`188`	`print(`