feat(//core)!: Added support for Device meta data serialization and d… #484

Merged
merged 9 commits on Jul 12, 2021
16 changes: 11 additions & 5 deletions core/compiler.cpp
@@ -31,9 +31,11 @@ void AddEngineToGraph(
torch::jit::script::Module mod,
std::shared_ptr<torch::jit::Graph>& g,
const std::string& serialized_engine,
runtime::CudaDevice& device_info,
std::string engine_id = "",
bool fallback = false) {
auto engine_ptr = c10::make_intrusive<runtime::TRTEngine>(mod._ivalue()->name() + engine_id, serialized_engine);
auto engine_ptr =
c10::make_intrusive<runtime::TRTEngine>(mod._ivalue()->name() + engine_id, serialized_engine, device_info);
// Get required metadata about the engine out
auto num_io = engine_ptr->num_io;
auto name = engine_ptr->name;
@@ -220,7 +222,9 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo
convert_cfg.input_ranges = input_ranges;
auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, named_params);
auto temp_g = std::make_shared<torch::jit::Graph>();
AddEngineToGraph(new_mod, temp_g, engine, trt_engine_id.str(), true);
auto device_spec = convert_cfg.engine_settings.device;
auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
AddEngineToGraph(new_mod, temp_g, engine, cuda_device, trt_engine_id.str(), true);

seg_block.update_graph(temp_g);
AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g);
@@ -260,7 +264,9 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, C
if (method.name().compare("forward") == 0) {
auto engine = ConvertGraphToTRTEngine(mod, method.name(), cfg);
auto new_g = std::make_shared<torch::jit::Graph>();
AddEngineToGraph(new_mod, new_g, engine);
auto device_spec = cfg.convert_info.engine_settings.device;
auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
AddEngineToGraph(new_mod, new_g, engine, cuda_device);
auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
new_mod.type()->addMethod(new_method);
@@ -271,12 +277,12 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, C
return new_mod;
}

torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine) {
torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CudaDevice cuda_device) {
std::ostringstream engine_id;
engine_id << reinterpret_cast<const int*>(&engine);
torch::jit::script::Module new_mod("tensorrt_engine_mod_" + engine_id.str());
auto new_g = std::make_shared<torch::jit::Graph>();
AddEngineToGraph(new_mod, new_g, engine);
AddEngineToGraph(new_mod, new_g, engine, cuda_device);
auto new_method = new_mod._ivalue()->compilation_unit()->create_function("forward", new_g);
auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
new_mod.type()->addMethod(new_method);
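
Note: every path that embeds an engine now threads device metadata through AddEngineToGraph. A minimal usage sketch for the updated EmbedEngineInNewModule, assuming a serialized engine string is already in hand (the gpu_id of 0, the kGPU device type, and the load_engine helper are illustrative, not part of the API):

#include "core/compiler.h"
#include "core/runtime/runtime.h"

// Sketch: embed a prebuilt TensorRT engine along with the device it targets.
// `engine_blob` is assumed to hold the bytes of a serialized TensorRT engine.
std::string engine_blob = load_engine("model.engine"); // hypothetical helper
auto cuda_device = trtorch::core::runtime::CudaDevice(0, nvinfer1::DeviceType::kGPU);
auto mod = trtorch::core::EmbedEngineInNewModule(engine_blob, cuda_device);
mod.save("embedded.ts"); // device info now rides along with the pickled engine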
3 changes: 2 additions & 1 deletion core/compiler.h
@@ -5,6 +5,7 @@
#include "core/conversion/conversion.h"
#include "core/ir/ir.h"
#include "core/partitioning/partitioning.h"
#include "core/runtime/runtime.h"
#include "torch/csrc/jit/api/module.h"

namespace trtorch {
@@ -22,7 +23,7 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::

torch::jit::script::Module CompileGraph(const torch::jit::script::Module& module, CompileSpec cfg);

torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine);
torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CudaDevice cuda_device);

void set_device(const int gpu_id);

3 changes: 3 additions & 0 deletions core/runtime/BUILD
@@ -10,8 +10,11 @@ config_setting(
cc_library(
name = "runtime",
srcs = [
"CudaDevice.cpp",
"DeviceList.cpp",
"TRTEngine.cpp",
"register_trt_op.cpp",
"runtime.cpp"
],
hdrs = [
"runtime.h",
106 changes: 106 additions & 0 deletions core/runtime/CudaDevice.cpp
@@ -0,0 +1,106 @@
#include "cuda_runtime.h"

#include "core/runtime/runtime.h"
#include "core/util/prelude.h"

namespace trtorch {
namespace core {
namespace runtime {

const std::string DEVICE_INFO_DELIM = "%";

typedef enum { ID_IDX = 0, SM_MAJOR_IDX, SM_MINOR_IDX, DEVICE_TYPE_IDX, DEVICE_NAME_IDX } SerializedDeviceInfoIndex;

CudaDevice::CudaDevice() : id{-1}, major{-1}, minor{-1}, device_type{nvinfer1::DeviceType::kGPU} {}

CudaDevice::CudaDevice(int64_t gpu_id, nvinfer1::DeviceType device_type) {
cudaDeviceProp device_prop;

// Device ID
this->id = gpu_id;

// Get Device Properties
cudaGetDeviceProperties(&device_prop, gpu_id);

// Compute capability major version
this->major = device_prop.major;

// Compute capability minor version
this->minor = device_prop.minor;

std::string device_name(device_prop.name);

// Set Device name
this->device_name = device_name;

// Set Device Type
this->device_type = device_type;
}

// NOTE: Serialization Format for Device Info:
// id%major%minor%(enum)device_type%device_name

CudaDevice::CudaDevice(std::string device_info) {
LOG_DEBUG("Deserializing Device Info: " << device_info);

std::vector<std::string> tokens;
int64_t start = 0;
int64_t end = device_info.find(DEVICE_INFO_DELIM);

while (end != -1) {
tokens.push_back(device_info.substr(start, end - start));
start = end + DEVICE_INFO_DELIM.size();
end = device_info.find(DEVICE_INFO_DELIM, start);
}
tokens.push_back(device_info.substr(start, end - start));

TRTORCH_CHECK(tokens.size() == DEVICE_NAME_IDX + 1, "Unable to deserialize program target device information");

id = std::stoi(tokens[ID_IDX]);
major = std::stoi(tokens[SM_MAJOR_IDX]);
minor = std::stoi(tokens[SM_MINOR_IDX]);
device_type = (nvinfer1::DeviceType)(std::stoi(tokens[DEVICE_TYPE_IDX]));
device_name = tokens[DEVICE_NAME_IDX];

LOG_DEBUG("Deserialized Device Info: " << *this);
}

std::string CudaDevice::serialize() {
std::vector<std::string> content;
content.resize(DEVICE_NAME_IDX + 1);

content[ID_IDX] = std::to_string(id);
content[SM_MAJOR_IDX] = std::to_string(major);
content[SM_MINOR_IDX] = std::to_string(minor);
content[DEVICE_TYPE_IDX] = std::to_string((int64_t)device_type);
content[DEVICE_NAME_IDX] = device_name;

std::stringstream ss;
for (size_t i = 0; i < content.size() - 1; i++) {
ss << content[i] << DEVICE_INFO_DELIM;
}
ss << content[DEVICE_NAME_IDX];

std::string serialized_device_info = ss.str();

LOG_DEBUG("Serialized Device Info: " << serialized_device_info);

return serialized_device_info;
}

std::string CudaDevice::getSMCapability() const {
std::stringstream ss;
ss << major << "." << minor;
return ss.str();
}

std::ostream& operator<<(std::ostream& os, const CudaDevice& device) {
os << "Device(ID: " << device.id << ", Name: " << device.device_name << ", SM Capability: " << device.major << '.'
<< device.minor << ", Type: " << device.device_type << ')';
return os;
}

} // namespace runtime
} // namespace core
} // namespace trtorch
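
For reference, the scheme above flattens a device record into a single "%"-delimited string and parses it back positionally. A round-trip sketch with illustrative values (the SM capability 8.6 and the device name are examples, not guaranteed output):

// Sketch: serialize and rebuild a CudaDevice (values illustrative).
trtorch::core::runtime::CudaDevice dev(0, nvinfer1::DeviceType::kGPU);
std::string blob = dev.serialize(); // e.g. "0%8%6%0%NVIDIA GeForce RTX 3090"
trtorch::core::runtime::CudaDevice restored(blob); // recovers id, SM version, type, name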
45 changes: 45 additions & 0 deletions core/runtime/DeviceList.cpp
@@ -0,0 +1,45 @@
#include "cuda_runtime.h"

#include "core/runtime/runtime.h"
#include "core/util/prelude.h"

namespace trtorch {
namespace core {
namespace runtime {

DeviceList::DeviceList() {
int num_devices = 0;
auto status = cudaGetDeviceCount(&num_devices);
TRTORCH_ASSERT((status == cudaSuccess), "Unable to read CUDA capable devices. Return status: " << status);
for (int i = 0; i < num_devices; i++) {
device_list[i] = CudaDevice(i, nvinfer1::DeviceType::kGPU);
}

// REVIEW: DO WE CARE ABOUT DLA?

LOG_DEBUG("Runtime:\n Available CUDA Devices: \n" << this->dump_list());
}

void DeviceList::insert(int device_id, CudaDevice cuda_device) {
device_list[device_id] = cuda_device;
}

CudaDevice DeviceList::find(int device_id) {
return device_list[device_id];
}

DeviceList::DeviceMap DeviceList::get_devices() {
return device_list;
}

std::string DeviceList::dump_list() {
std::stringstream ss;
for (auto it = device_list.begin(); it != device_list.end(); ++it) {
ss << " " << it->second << std::endl;
}
return ss.str();
}

} // namespace runtime
} // namespace core
} // namespace trtorch
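
A short sketch of how the list can be exercised, assuming it is constructed once at runtime initialization (the variable names are illustrative):

#include <iostream>

// Sketch: enumerate the CUDA devices visible to the runtime.
trtorch::core::runtime::DeviceList devices; // constructor queries cudaGetDeviceCount
auto dev0 = devices.find(0);                // CudaDevice registered for GPU 0
std::cout << devices.dump_list() << std::endl;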
55 changes: 46 additions & 9 deletions core/runtime/TRTEngine.cpp
@@ -1,5 +1,6 @@
#include <algorithm>

#include <cuda_runtime.h>
#include "NvInfer.h"
#include "torch/csrc/jit/frontend/function_schema_parser.h"

@@ -10,30 +11,55 @@ namespace trtorch {
namespace core {
namespace runtime {

typedef enum { ABI_TARGET_IDX = 0, DEVICE_IDX, ENGINE_IDX } SerializedInfoIndex;

std::string slugify(std::string s) {
std::replace(s.begin(), s.end(), '.', '_');
return s;
}

TRTEngine::TRTEngine(std::string serialized_engine)
TRTEngine::TRTEngine(std::string serialized_engine, CudaDevice cuda_device)
: logger(
std::string("[] - "),
util::logging::get_logger().get_reportable_severity(),
util::logging::get_logger().get_is_colored_output_on()) {
std::string _name = "deserialized_trt";
new (this) TRTEngine(_name, serialized_engine);
new (this) TRTEngine(_name, serialized_engine, cuda_device);
}

TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine)
TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
: logger(
std::string("[] = "),
util::logging::get_logger().get_reportable_severity(),
util::logging::get_logger().get_is_colored_output_on()) {
TRTORCH_CHECK(
serialized_info.size() == ENGINE_IDX + 1, "Program to be deserialized targets an incompatible TRTorch ABI");
TRTORCH_CHECK(
serialized_info[ABI_TARGET_IDX] == ABI_VERSION,
"Program to be deserialized targets a different TRTorch ABI Version ("
<< serialized_info[ABI_TARGET_IDX] << ") than the TRTorch Runtime ABI (" << ABI_VERSION << ")");
std::string _name = "deserialized_trt";
std::string engine_info = serialized_info[ENGINE_IDX];

CudaDevice cuda_device = deserialize_device(serialized_info[DEVICE_IDX]);
new (this) TRTEngine(_name, engine_info, cuda_device);
}

TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CudaDevice cuda_device)
: logger(
std::string("[") + mod_name + std::string("_engine] - "),
util::logging::get_logger().get_reportable_severity(),
util::logging::get_logger().get_is_colored_output_on()) {
device_info = cuda_device;
set_cuda_device(device_info);

rt = nvinfer1::createInferRuntime(logger);

name = slugify(mod_name) + "_engine";

cuda_engine = rt->deserializeCudaEngine(serialized_engine.c_str(), serialized_engine.size());
TRTORCH_CHECK((cuda_engine != nullptr), "Unable to deserialize the TensorRT engine");

// Easy way to get a unique name for each engine, maybe there is a more
// descriptive way (using something associated with the graph maybe)
id = reinterpret_cast<EngineID>(cuda_engine);
@@ -63,6 +89,7 @@ TRTEngine& TRTEngine::operator=(const TRTEngine& other) {
id = other.id;
rt = other.rt;
cuda_engine = other.cuda_engine;
device_info = other.device_info;
exec_ctx = other.exec_ctx;
num_io = other.num_io;
return (*this);
@@ -85,18 +112,28 @@ TRTEngine::~TRTEngine() {
namespace {
static auto TRTORCH_UNUSED TRTEngineTSRegistration =
torch::class_<TRTEngine>("tensorrt", "Engine")
.def(torch::init<std::string>())
.def(torch::init<std::vector<std::string>>())
// TODO: .def("__call__", &TRTEngine::Run)
// TODO: .def("run", &TRTEngine::Run)
.def_pickle(
[](const c10::intrusive_ptr<TRTEngine>& self) -> std::string {
auto serialized_engine = self->cuda_engine->serialize();
return std::string((const char*)serialized_engine->data(), serialized_engine->size());
[](const c10::intrusive_ptr<TRTEngine>& self) -> std::vector<std::string> {
// Serialize TensorRT engine
auto serialized_trt_engine = self->cuda_engine->serialize();

// Adding device info related meta data to the serialized file
auto trt_engine = std::string((const char*)serialized_trt_engine->data(), serialized_trt_engine->size());

std::vector<std::string> serialize_info;
serialize_info.push_back(ABI_VERSION);
serialize_info.push_back(serialize_device(self->device_info));
serialize_info.push_back(trt_engine);
return serialize_info;
},
[](std::string seralized_engine) -> c10::intrusive_ptr<TRTEngine> {
return c10::make_intrusive<TRTEngine>(std::move(seralized_engine));
[](std::vector<std::string> serialized_info) -> c10::intrusive_ptr<TRTEngine> {
return c10::make_intrusive<TRTEngine>(std::move(serialized_info));
});
} // namespace

} // namespace runtime
} // namespace core
} // namespace trtorch
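
The pickled form of a TRTEngine is now a three-field vector rather than a bare engine string, indexed by SerializedInfoIndex. A sketch of the layout with illustrative contents (serialize_device and deserialize_device are assumed to wrap CudaDevice::serialize and the deserializing constructor):

// Sketch: what def_pickle produces for one engine (contents illustrative).
// serialized_info[ABI_TARGET_IDX] -> runtime ABI version string, checked on load
// serialized_info[DEVICE_IDX]     -> "%"-delimited device record, e.g. "0%8%6%0%..."
// serialized_info[ENGINE_IDX]     -> raw bytes of the serialized TensorRT engine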