chore: execute TensorRT engines on separate CUDA streams instead of the current stream

peri044 · peri044 · commit d16585f0be1c · 2024-04-12T16:26:37.000-07:00
diff --git a/core/runtime/execute_engine.cpp b/core/runtime/execute_engine.cpp
@@ -178,8 +178,7 @@ std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intr
       enqueue_profiler_guard =
           std::make_unique<torch::autograd::profiler::RecordProfile>(compiled_engine->enqueue_profile_path);
     }
-    c10::cuda::CUDAStream stream = c10::cuda::getCurrentCUDAStream(inputs[0].device().index());
-
+    c10::cuda::CUDAStream stream = c10::cuda::getStreamFromPool(/*isHighPriority=*/true, inputs[0].device().index());
     // nvinfer1::IExecutionContext::enqueue is not thread safe and we need a mutex for it.
     std::unique_lock<std::mutex> lock(compiled_engine->mu);
     compiled_engine->exec_ctx->enqueueV3(stream);
diff --git a/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_PythonTorchTensorRTModule.py
@@ -59,7 +59,7 @@ def _initialize(self) -> None:
         runtime = trt.Runtime(logger)
         self.engine = runtime.deserialize_cuda_engine(self.engine)
         self.context = self.engine.create_execution_context()
-
+        self.stream = torch.cuda.Stream(torch.cuda.current_device())
         # Indices of inputs/outputs in the trt engine bindings, in the order
         # as they are in the original PyTorch model.
 
@@ -286,7 +286,7 @@ def forward(self, *inputs: torch.Tensor) -> torch.Tensor | Tuple[torch.Tensor, .
                 if self.profiling_enabled
                 else nullcontext()
             ):
-                self.context.execute_async_v3(torch.cuda.current_stream().cuda_stream)
+                self.context.execute_async_v3(self.stream.cuda_stream)
 
             if len(outputs) == 1:
                 return outputs[0]