+#include "ATen/cuda/CUDAEvent.h"
 #include "c10/cuda/CUDAGuard.h"
 #include "c10/cuda/CUDAStream.h"
@@ -70,7 +71,7 @@ bool _cudagraphs_validate_shapes(std::vector<at::Tensor> inputs, c10::intrusive_
     new_shape_key_ss << "(";
     auto sizes = input.sizes();
     auto rank = input.sizes().size();
-    for (auto i = 0; i < rank; i++) {
+    for (size_t i = 0; i < rank; i++) {
       new_shape_key_ss << sizes[i];
       // For all but the final dimension in the shape key, add comma separator
       if (i < rank - 1) {
@@ -142,13 +143,13 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
       select_rt_device(compiled_engine->device_info, curr_device, compiled_engine->hardware_compatible);
   set_rt_device(device);
 
+  compiled_engine->caller_stream = c10::cuda::getCurrentCUDAStream(device.id);
   // Update active stream based on new device
-  auto current_stream = c10::cuda::getCurrentCUDAStream(device.id);
-  if (current_stream == c10::cuda::getDefaultCUDAStream(device.id)) {
-    compiled_engine->active_stream = c10::cuda::getStreamFromPool(false, device.id);
-    c10::cuda::setCurrentCUDAStream(compiled_engine->active_stream);
+  if (compiled_engine->caller_stream == c10::cuda::getDefaultCUDAStream(device.id)) {
+    compiled_engine->engine_stream = c10::cuda::getStreamFromPool(false, device.id);
+    c10::cuda::setCurrentCUDAStream(compiled_engine->engine_stream);
   } else {
-    compiled_engine->active_stream = current_stream;
+    compiled_engine->engine_stream = compiled_engine->caller_stream;
   }
 
   // Target device is new device
@@ -274,16 +275,23 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
 
   if (!CUDAGRAPHS_MODE) {
     // If not in cudagraphs mode, proceed with enqueueV3 as normal
-    compiled_engine->exec_ctx->enqueueV3(compiled_engine->active_stream);
+    at::cuda::CUDAEvent caller_exec_complete;
+    caller_exec_complete.record(compiled_engine->caller_stream);
+    caller_exec_complete.block(compiled_engine->engine_stream);
+    compiled_engine->exec_ctx->enqueueV3(compiled_engine->engine_stream);
+    at::cuda::CUDAEvent trt_exec_complete;
+    trt_exec_complete.record(compiled_engine->engine_stream);
+    trt_exec_complete.block(compiled_engine->caller_stream);
   } else if (need_cudagraphs_record) {
     // If cudagraphs needs to record a graph, capture the enqueueV3 call in a graph
 
     // Cudagraphs cannot record on the current stream, so use an alternate
     c10::cuda::CUDAStream recording_stream = c10::cuda::getStreamFromPool(false, inputs[0].device().index());
     c10::cuda::CUDAStreamGuard guard(recording_stream);
 
-    compiled_engine->exec_ctx->enqueueV3(recording_stream);
-    recording_stream.synchronize();
+    at::cuda::CUDAEvent caller_exec_complete;
+    caller_exec_complete.record(compiled_engine->caller_stream);
+    caller_exec_complete.block(recording_stream);
 
     compiled_engine->cudagraph.capture_begin();
     compiled_engine->exec_ctx->enqueueV3(recording_stream);
@@ -294,7 +302,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
 
   } else {
     // If the cudagraph has already been recorded, copy the input buffers and replay it
-    for (auto i = 0; i < inputs.size(); i++) {
+    for (size_t i = 0; i < inputs.size(); i++) {
       compiled_engine->input_buffers[i].copy_(inputs[i], true);
     }
     compiled_engine->cudagraph.replay();
@@ -305,7 +313,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
   // In cudagraphs mode, the output buffers can be reused, so they must
   // be cloned before providing them to the user to avoid data corruption
   if (CUDAGRAPHS_MODE) {
-    for (auto i = 0; i < compiled_engine->output_buffers.size(); i++) {
+    for (size_t i = 0; i < compiled_engine->output_buffers.size(); i++) {
       model_outputs[i] = compiled_engine->output_buffers[i].clone();
     }
   } else {
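
For reference, the diff replaces the single active_stream and the blocking recording_stream.synchronize() with a caller_stream / engine_stream pair ordered through at::cuda::CUDAEvent. Below is a minimal standalone sketch of that record/block handoff; the run_on_side_stream helper and its work callback are hypothetical names introduced only for illustration, while the ATen/c10 stream and event APIs are the ones the patch itself uses.

#include <functional>

#include "ATen/cuda/CUDAEvent.h"
#include "c10/cuda/CUDAStream.h"

// Run `work` on a pooled side stream while keeping it ordered with respect to
// the caller's current stream, without any host-side synchronization.
void run_on_side_stream(c10::DeviceIndex device_idx, const std::function<void(c10::cuda::CUDAStream)>& work) {
  c10::cuda::CUDAStream caller_stream = c10::cuda::getCurrentCUDAStream(device_idx);
  c10::cuda::CUDAStream side_stream = c10::cuda::getStreamFromPool(/*isHighPriority=*/false, device_idx);

  // The side stream must wait for work already queued on the caller's stream
  at::cuda::CUDAEvent caller_ready;
  caller_ready.record(caller_stream);
  caller_ready.block(side_stream);

  // e.g. exec_ctx->enqueueV3(side_stream) in the engine path above
  work(side_stream);

  // The caller's stream must wait for the side-stream work before continuing
  at::cuda::CUDAEvent side_done;
  side_done.record(side_stream);
  side_done.block(caller_stream);
}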