pytorch
diff --git a/.github/workflows/build-test-linux.yml b/.github/workflows/build-test-linux.yml
Lines changed: 6 additions & 4 deletions
diff --git a/.github/workflows/build-test-windows.yml b/.github/workflows/build-test-windows.yml
Lines changed: 4 additions & 4 deletions
diff --git a/core/conversion/var/Var.cpp b/core/conversion/var/Var.cpp
Lines changed: 2 additions & 2 deletions
diff --git a/core/runtime/TRTEngine.cpp b/core/runtime/TRTEngine.cpp
Lines changed: 1 addition & 9 deletions
diff --git a/core/runtime/TRTEngine.h b/core/runtime/TRTEngine.h
Lines changed: 3 additions & 1 deletion
@@ -8,9 +8,9 @@ on:
       - nightly
       - release/*
     tags:
-        # NOTE: Binary build pipelines should only get triggered on release candidate builds
-        # Release candidate tags look like: v1.11.0-rc1
-        - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+      # NOTE: Binary build pipelines should only get triggered on release candidate builds
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
   workflow_dispatch:
 
 jobs:
@@ -229,7 +229,9 @@ jobs:
         export USE_HOST_DEPS=1
         pushd .
         cd tests/py/dynamo
-        python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml runtime/
+        python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml --ignore test_cudagraphs_py.py --ignore test_cudagraphs_cpp.py runtime/
+        python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results_cuda_graphs_cpp.xml runtime/test_cudagraphs_cpp.py
+        python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results_cuda_graphs_py.xml runtime/test_cudagraphs_py.py
         python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/
         python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/
         popd
 
@@ -8,9 +8,9 @@ on:
       - nightly
       - release/*
     tags:
-        # NOTE: Binary build pipelines should only get triggered on release candidate builds
-        # Release candidate tags look like: v1.11.0-rc1
-        - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
+      # NOTE: Binary build pipelines should only get triggered on release candidate builds
+      # Release candidate tags look like: v1.11.0-rc1
+      - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+
   workflow_dispatch:
 
 jobs:
@@ -219,7 +219,7 @@ jobs:
         export USE_HOST_DEPS=1
         pushd .
         cd tests/py/dynamo
-        python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml runtime/
+        python -m pytest --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_runtime_test_results.xml runtime/
         python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_partitioning_test_results.xml partitioning/
         python -m pytest -n 4 --junitxml=${RUNNER_TEST_RESULTS_DIR}/tests_py_dynamo_core_lowering_test_results.xml lowering/
         popd
 
@@ -153,7 +153,7 @@ bool Var::isITensorList() {
   // Unpack the Var as a List and check if each entry is a custom class since
   // ITensors are stored in CustomClassHolder
   auto ival_list = ptr_.ivalue->toList();
-  for (int i = 0; i < ival_list.size(); i++) {
+  for (size_t i = 0; i < ival_list.size(); i++) {
     if (!ival_list.get(i).isCustomClass()) {
       return false;
     }
@@ -167,7 +167,7 @@ std::vector<nvinfer1::ITensor*> Var::unwrapToITensorList() {
   TORCHTRT_CHECK(isITensorList(), "Expected IValue to be an ITensorList");
   auto ivalue_list = ptr_.ivalue->toList();
   std::vector<nvinfer1::ITensor*> outputs;
-  for (int i = 0; i < ivalue_list.size(); i++) {
+  for (size_t i = 0; i < ivalue_list.size(); i++) {
     auto element = ivalue_list.get(i).toCustomClass<TensorContainer>()->tensor();
     outputs.push_back(std::move(element));
   }
 
@@ -71,15 +71,6 @@ TRTEngine::TRTEngine(
   multi_gpu_device_check();
   set_rt_device(device_info);
 
-  // Set active stream to non-default stream
-  auto current_stream = c10::cuda::getCurrentCUDAStream(device_info.id);
-  if (current_stream == c10::cuda::getDefaultCUDAStream(device_info.id)) {
-    active_stream = c10::cuda::getStreamFromPool(false, device_info.id);
-    c10::cuda::setCurrentCUDAStream(active_stream);
-  } else {
-    active_stream = current_stream;
-  }
-
   rt = make_trt(nvinfer1::createInferRuntime(util::logging::get_logger()));
 
   name = slugify(mod_name);
@@ -253,6 +244,7 @@ void TRTEngine::set_profiling_paths() {
   enqueue_profile_path = std::filesystem::path{profile_path_prefix + "/" + name + "_enqueue_profile.trace"}.string();
   trt_engine_profile_path =
       std::filesystem::path{profile_path_prefix + "/" + name + "_engine_exectuion_profile.trace"}.string();
+  cuda_graph_debug_path = std::filesystem::path{profile_path_prefix + "/" + name + "_cuda_graph.dot"}.string();
 }
 
 std::string TRTEngine::to_str() const {
 
@@ -70,7 +70,8 @@ struct TRTEngine : torch::CustomClassHolder {
 
   // CUDAGraph-Related Functionality
   at::cuda::CUDAGraph cudagraph = {};
-  at::cuda::CUDAStream active_stream = c10::cuda::getDefaultCUDAStream();
+  at::cuda::CUDAStream engine_stream = c10::cuda::getDefaultCUDAStream();
+  at::cuda::CUDAStream caller_stream = c10::cuda::getDefaultCUDAStream();
   std::vector<at::Tensor> input_buffers = {};
   std::vector<at::Tensor> output_buffers = {};
   std::string shape_key;
@@ -89,6 +90,7 @@ struct TRTEngine : torch::CustomClassHolder {
   std::string output_profile_path;
   std::string enqueue_profile_path;
   std::string trt_engine_profile_path;
+  std::string cuda_graph_debug_path;
   std::mutex mu;
   std::unique_ptr<TRTEngineProfiler> trt_engine_profiler;
 };