Skip to content

Commit bf4dd23

Browse files
author
Anurag Dixit
committed
feat(//core)!: Added support for Device meta data serialization and deserialization implicitly
1 parent cbe1866 commit bf4dd23

File tree

14 files changed

+433
-28
lines changed

14 files changed

+433
-28
lines changed

core/compiler.cpp

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,11 @@ void AddEngineToGraph(
3131
torch::jit::script::Module mod,
3232
std::shared_ptr<torch::jit::Graph>& g,
3333
const std::string& serialized_engine,
34+
runtime::CudaDevice& device_info,
3435
std::string engine_id = "",
3536
bool fallback = false) {
36-
auto engine_ptr = c10::make_intrusive<runtime::TRTEngine>(mod._ivalue()->name() + engine_id, serialized_engine);
37+
auto engine_ptr =
38+
c10::make_intrusive<runtime::TRTEngine>(mod._ivalue()->name() + engine_id, serialized_engine, device_info);
3739
// Get required metadata about the engine out
3840
auto num_io = engine_ptr->num_io;
3941
auto name = engine_ptr->name;
@@ -220,7 +222,9 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo
220222
convert_cfg.input_ranges = input_ranges;
221223
auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, named_params);
222224
auto temp_g = std::make_shared<torch::jit::Graph>();
223-
AddEngineToGraph(new_mod, temp_g, engine, trt_engine_id.str(), true);
225+
auto device_spec = convert_cfg.engine_settings.device;
226+
auto cuda_device = runtime::get_device_info(device_spec.gpu_id, device_spec.device_type);
227+
AddEngineToGraph(new_mod, temp_g, engine, cuda_device, trt_engine_id.str(), true);
224228

225229
seg_block.update_graph(temp_g);
226230
AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g);
@@ -260,7 +264,9 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, C
260264
if (method.name().rfind("_", 0)) {
261265
auto engine = ConvertGraphToTRTEngine(mod, method.name(), cfg);
262266
auto new_g = std::make_shared<torch::jit::Graph>();
263-
AddEngineToGraph(new_mod, new_g, engine);
267+
auto device_spec = cfg.convert_info.engine_settings.device;
268+
auto cuda_device = runtime::get_device_info(device_spec.gpu_id, device_spec.device_type);
269+
AddEngineToGraph(new_mod, new_g, engine, cuda_device);
264270
auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
265271
auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
266272
new_mod.type()->addMethod(new_method);
@@ -271,12 +277,14 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, C
271277
return new_mod;
272278
}
273279

274-
torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine) {
280+
torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, CompileSpec cfg) {
275281
std::ostringstream engine_id;
276282
engine_id << reinterpret_cast<const int*>(&engine);
277283
torch::jit::script::Module new_mod("tensorrt_engine_mod_" + engine_id.str());
278284
auto new_g = std::make_shared<torch::jit::Graph>();
279-
AddEngineToGraph(new_mod, new_g, engine);
285+
auto device_spec = cfg.convert_info.engine_settings.device;
286+
auto cuda_device = runtime::get_device_info(device_spec.gpu_id, device_spec.device_type);
287+
AddEngineToGraph(new_mod, new_g, engine, cuda_device);
280288
auto new_method = new_mod._ivalue()->compilation_unit()->create_function("forward", new_g);
281289
auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
282290
new_mod.type()->addMethod(new_method);

core/compiler.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include "core/conversion/conversion.h"
66
#include "core/ir/ir.h"
77
#include "core/partitioning/partitioning.h"
8+
#include "core/runtime/runtime.h"
89
#include "torch/csrc/jit/api/module.h"
910

1011
namespace trtorch {
@@ -22,7 +23,7 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::
2223

2324
torch::jit::script::Module CompileGraph(const torch::jit::script::Module& module, CompileSpec cfg);
2425

25-
torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine);
26+
torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, CompileSpec cfg);
2627

2728
void set_device(const int gpu_id);
2829

core/runtime/TRTEngine.cpp

Lines changed: 152 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#include <algorithm>
22

3+
#include <cuda_runtime.h>
34
#include "NvInfer.h"
45
#include "torch/csrc/jit/frontend/function_schema_parser.h"
56

@@ -15,20 +16,35 @@ std::string slugify(std::string s) {
1516
return s;
1617
}
1718

18-
TRTEngine::TRTEngine(std::string serialized_engine)
19+
TRTEngine::TRTEngine(std::string serialized_engine, CudaDevice cuda_device)
1920
: logger(
2021
std::string("[] - "),
2122
util::logging::get_logger().get_reportable_severity(),
2223
util::logging::get_logger().get_is_colored_output_on()) {
2324
std::string _name = "deserialized_trt";
24-
new (this) TRTEngine(_name, serialized_engine);
25+
new (this) TRTEngine(_name, serialized_engine, cuda_device);
2526
}
2627

27-
TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine)
28+
// Pickle/deserialization constructor: serialized_info holds the device metadata
// blob and the engine blob, indexed by DeviceIdx / EngineIdx. Delegates to the
// main constructor instead of the original placement `new (this)`, which
// re-constructed members (including the logger) over live objects without
// destroying them — a leak and lifetime hazard.
TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
    : TRTEngine(
          "deserialized_trt",
          serialized_info[EngineIdx],
          deserialize_device(serialized_info[DeviceIdx])) {}
39+
40+
TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CudaDevice cuda_device)
2841
: logger(
2942
std::string("[") + mod_name + std::string("_engine] - "),
3043
util::logging::get_logger().get_reportable_severity(),
3144
util::logging::get_logger().get_is_colored_output_on()) {
45+
device_info = cuda_device;
46+
set_cuda_device(device_info);
47+
3248
rt = nvinfer1::createInferRuntime(logger);
3349

3450
name = slugify(mod_name) + "_engine";
@@ -63,6 +79,7 @@ TRTEngine& TRTEngine::operator=(const TRTEngine& other) {
6379
id = other.id;
6480
rt = other.rt;
6581
cuda_engine = other.cuda_engine;
82+
device_info = other.device_info;
6683
exec_ctx = other.exec_ctx;
6784
num_io = other.num_io;
6885
return (*this);
@@ -85,18 +102,144 @@ TRTEngine::~TRTEngine() {
85102
namespace {
86103
static auto TRTORCH_UNUSED TRTEngineTSRegistrtion =
87104
torch::class_<TRTEngine>("tensorrt", "Engine")
88-
.def(torch::init<std::string>())
105+
.def(torch::init<std::vector<std::string>>())
89106
// TODO: .def("__call__", &TRTEngine::Run)
90107
// TODO: .def("run", &TRTEngine::Run)
91108
.def_pickle(
92-
[](const c10::intrusive_ptr<TRTEngine>& self) -> std::string {
93-
auto serialized_engine = self->cuda_engine->serialize();
94-
return std::string((const char*)serialized_engine->data(), serialized_engine->size());
109+
[](const c10::intrusive_ptr<TRTEngine>& self) -> std::vector<std::string> {
110+
// Serialize TensorRT engine
111+
auto serialized_trt_engine = self->cuda_engine->serialize();
112+
113+
// Adding device info related meta data to the serialized file
114+
auto trt_engine = std::string((const char*)serialized_trt_engine->data(), serialized_trt_engine->size());
115+
116+
std::vector<std::string> serialize_info;
117+
serialize_info.push_back(serialize_device(self->device_info));
118+
serialize_info.push_back(trt_engine);
119+
return serialize_info;
95120
},
96-
[](std::string seralized_engine) -> c10::intrusive_ptr<TRTEngine> {
97-
return c10::make_intrusive<TRTEngine>(std::move(seralized_engine));
121+
[](std::vector<std::string> seralized_info) -> c10::intrusive_ptr<TRTEngine> {
122+
return c10::make_intrusive<TRTEngine>(std::move(seralized_info));
98123
});
99124
} // namespace
125+
// Makes the given device the active CUDA device for this thread; aborts via
// TRTORCH_CHECK if the CUDA runtime rejects the id.
void set_cuda_device(CudaDevice& cuda_device) {
  auto result = cudaSetDevice(cuda_device.id);
  TRTORCH_CHECK((result == cudaSuccess), "Unable to set device: " << cuda_device.id);
}
128+
129+
// Queries the currently active CUDA device and fills cuda_device with its id,
// SM compute capability (major/minor) and name.
void get_cuda_device(CudaDevice& cuda_device) {
  int device = 0;
  // NOTE: do not report cuda_device.id in this first message — it has not been
  // assigned yet at this point, so the original message printed a stale value.
  TRTORCH_CHECK(
      (cudaGetDevice(&device) == cudaSuccess),
      "Unable to get current device");
  cuda_device.id = static_cast<int64_t>(device);
  cudaDeviceProp device_prop;
  TRTORCH_CHECK(
      (cudaGetDeviceProperties(&device_prop, cuda_device.id) == cudaSuccess),
      "Unable to get CUDA properties from device:" << cuda_device.id);
  cuda_device.set_major(device_prop.major);
  cuda_device.set_minor(device_prop.minor);
  std::string device_name(device_prop.name);
  cuda_device.set_device_name(device_name);
}
144+
145+
// Serializes a CudaDevice into a flat byte string with layout:
//   [id : int64][major : int64][minor : int64][device_type][name_len : size_t][name bytes]
// Must stay in sync with deserialize_device().
//
// Fixes over the original:
//  - the name was written with memcpy(&device_name, ...) which copied the
//    std::string object's internal representation, not its characters;
//  - the buffer was sized sizeof(cuda_device), which can be smaller than the
//    payload when the device name is long (heap overflow);
//  - the new[]'d buffer was never freed. Using std::string::append avoids the
//    manual buffer entirely.
std::string serialize_device(CudaDevice& cuda_device) {
  std::string payload;
  auto append_raw = [&payload](const void* src, size_t len) {
    payload.append(static_cast<const char*>(src), len);
  };

  int64_t temp = cuda_device.get_id();
  append_raw(&temp, sizeof(int64_t));

  temp = cuda_device.get_major();
  append_raw(&temp, sizeof(int64_t));

  temp = cuda_device.get_minor();
  append_raw(&temp, sizeof(int64_t));

  auto device_type = cuda_device.get_device_type();
  append_raw(&device_type, sizeof(nvinfer1::DeviceType));

  auto device_name = cuda_device.get_device_name();
  // Write the actual character count so the length field always matches the
  // bytes that follow.
  size_t device_name_len = device_name.size();
  append_raw(&device_name_len, sizeof(size_t));
  append_raw(device_name.data(), device_name.size());

  return payload;
}
175+
176+
// Reconstructs a CudaDevice from the byte layout produced by serialize_device().
//
// Fixes over the original:
//  - the name length was read with memcpy(&size, &buffer, ...), i.e. from the
//    address of the cursor variable itself rather than from the stream;
//  - the name was memcpy'd into a std::string object (UB) instead of being
//    constructed from the character range;
//  - device_type was read but never stored in the result;
//  - the new[]'d copy of the input was leaked. Reading straight from the
//    input string removes the copy altogether.
CudaDevice deserialize_device(std::string device_info) {
  CudaDevice ret;
  const char* buffer = device_info.data();

  int64_t temp = 0;

  memcpy(&temp, buffer, sizeof(int64_t));
  buffer += sizeof(int64_t);
  ret.set_id(temp);

  memcpy(&temp, buffer, sizeof(int64_t));
  buffer += sizeof(int64_t);
  ret.set_major(temp);

  memcpy(&temp, buffer, sizeof(int64_t));
  buffer += sizeof(int64_t);
  ret.set_minor(temp);

  nvinfer1::DeviceType device_type;
  memcpy(&device_type, buffer, sizeof(nvinfer1::DeviceType));
  buffer += sizeof(nvinfer1::DeviceType);
  ret.set_device_type(device_type);

  size_t size = 0;
  memcpy(&size, buffer, sizeof(size_t));
  buffer += sizeof(size_t);
  ret.set_device_name_len(size);

  // Build the name from the raw character range that follows the length field.
  std::string device_name(buffer, size);
  buffer += size;
  ret.set_device_name(device_name);

  return ret;
}
212+
213+
// Builds a fully populated CudaDevice record (id, SM capability, name, type)
// for the given GPU id and configured device type.
CudaDevice get_device_info(int64_t gpu_id, nvinfer1::DeviceType device_type) {
  CudaDevice cuda_device;
  cudaDeviceProp device_prop;

  // Device ID
  cuda_device.set_id(gpu_id);

  // Get Device Properties — fail loudly instead of silently populating the
  // record from an uninitialized cudaDeviceProp on error.
  TRTORCH_CHECK(
      (cudaGetDeviceProperties(&device_prop, gpu_id) == cudaSuccess),
      "Unable to get CUDA properties from device: " << gpu_id);

  // Compute capability major version
  cuda_device.set_major(device_prop.major);

  // Compute capability minor version
  cuda_device.set_minor(device_prop.minor);

  std::string device_name(device_prop.name);

  // Set Device name
  cuda_device.set_device_name(device_name);

  // Set Device name len for serialization/deserialization
  cuda_device.set_device_name_len(device_name.size());

  // Set Device Type
  cuda_device.set_device_type(device_type);

  return cuda_device;
}
242+
100243
} // namespace runtime
101244
} // namespace core
102245
} // namespace trtorch

core/runtime/register_trt_op.cpp

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,87 @@ namespace trtorch {
1010
namespace core {
1111
namespace runtime {
1212

13+
// Checks if the context switch requred for device ID
14+
bool is_switch_required(const CudaDevice& curr_device, const CudaDevice& conf_device) {
15+
// If SM capability is not the same as configured then switch
16+
if ((curr_device.major != conf_device.major) || (curr_device.minor != conf_device.minor)) {
17+
LOG_WARNING("Configured SM capability does not match with current device ID. Switching context");
18+
return true;
19+
}
20+
21+
// GPU case
22+
if (conf_device.device_type == nvinfer1::DeviceType::kGPU) {
23+
if (curr_device.device_name != conf_device.device_name) {
24+
LOG_WARNING("TRTEngine compiled for " << conf_device.device_name << " but current CUDA device is " << curr_device.device_name << ". Switching the device context");
25+
return true;
26+
}
27+
}
28+
29+
if (curr_device.id != conf_device.id) {
30+
LOG_WARNING("Configured Device ID: " << conf_device.id << " is different that current device ID: " << curr_device.id << ". Switching context");
31+
return true;
32+
}
33+
34+
return false;
35+
}
36+
37+
int select_cuda_device(const CudaDevice& conf_device) {
38+
int device_id = 0;
39+
int num_devices = 0;
40+
// SM Compute capability <major,minor> pair
41+
std::unordered_map<std::string, std::string> dla_supported_SM;
42+
43+
// Xavier SM Compute Capability
44+
dla_supported_SM.insert(std::make_pair("7.2", "Xavier"));
45+
auto status = cudaGetDeviceCount(&num_devices);
46+
TRTORCH_CHECK((status == cudaSuccess), "Unable to read CUDA capable devices. Return status: " << status);
47+
48+
cudaDeviceProp device_prop;
49+
50+
for (int i=0; i < num_devices; i++) {
51+
TRTORCH_CHECK((cudaGetDeviceProperties(&device_prop, i) == cudaSuccess), "Unable to read CUDA Device Properies for device id: " << i);
52+
auto compute_cap = std::to_string(device_prop.major) + "." + std::to_string(device_prop.minor);
53+
std::string device_name{device_prop.name};
54+
// In case of DLA select the DLA supported device ID
55+
if (conf_device.device_type == nvinfer1::DeviceType::kDLA) {
56+
if (dla_supported_SM.find(compute_cap) != dla_supported_SM.end() && dla_supported_SM[compute_cap] == device_name) {
57+
device_id = i;
58+
break;
59+
}
60+
}
61+
else if (conf_device.device_type == nvinfer1::DeviceType::kGPU) {
62+
auto conf_sm = std::to_string(conf_device.major) + "." + std::to_string(conf_device.minor);
63+
if (compute_cap == conf_sm && device_name == conf_device.device_name) {
64+
device_id = i;
65+
break;
66+
}
67+
}
68+
else {
69+
LOG_ERROR("Unkown device type detected from the compiled engine");
70+
break;
71+
}
72+
}
73+
return device_id;
74+
}
75+
1376
std::vector<at::Tensor> execute_engine(std::vector<at::Tensor> inputs, c10::intrusive_ptr<TRTEngine> compiled_engine) {
1477
LOG_DEBUG("Attempting to run engine (ID: " << compiled_engine->name << ")");
78+
79+
CudaDevice curr_device;
80+
get_cuda_device(curr_device);
81+
82+
if (is_switch_required(curr_device, compiled_engine->device_info)) {
83+
// Scan through available CUDA devices and set the CUDA device context correctly
84+
CudaDevice device{.id = select_cuda_device(compiled_engine->device_info)};
85+
set_cuda_device(device);
86+
87+
std::string target_device = "cuda:" + std::to_string(device.id);
88+
89+
for(auto& in : inputs) {
90+
in = in.to(at::kCUDA);
91+
}
92+
}
93+
1594
std::vector<void*> gpu_handles;
1695

1796
std::vector<at::Tensor> contig_inputs{};

0 commit comments

Comments
 (0)