feat(//core)!: Added support for Device meta data serialization and d… #484

Merged
merged 9 commits on Jul 12, 2021
16 changes: 11 additions & 5 deletions core/compiler.cpp
@@ -31,9 +31,11 @@ void AddEngineToGraph(
torch::jit::script::Module mod,
std::shared_ptr<torch::jit::Graph>& g,
const std::string& serialized_engine,
runtime::CudaDevice& device_info,
std::string engine_id = "",
bool fallback = false) {
auto engine_ptr = c10::make_intrusive<runtime::TRTEngine>(mod._ivalue()->name() + engine_id, serialized_engine);
auto engine_ptr =
c10::make_intrusive<runtime::TRTEngine>(mod._ivalue()->name() + engine_id, serialized_engine, device_info);
// Get required metadata about the engine out
auto num_io = engine_ptr->num_io;
auto name = engine_ptr->name;
@@ -220,7 +222,9 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo
convert_cfg.input_ranges = input_ranges;
auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, named_params);
auto temp_g = std::make_shared<torch::jit::Graph>();
AddEngineToGraph(new_mod, temp_g, engine, trt_engine_id.str(), true);
auto device_spec = convert_cfg.engine_settings.device;
auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
AddEngineToGraph(new_mod, temp_g, engine, cuda_device, trt_engine_id.str(), true);

seg_block.update_graph(temp_g);
AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g);
@@ -260,7 +264,9 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, C
if (method.name().compare("forward") == 0) {
auto engine = ConvertGraphToTRTEngine(mod, method.name(), cfg);
auto new_g = std::make_shared<torch::jit::Graph>();
AddEngineToGraph(new_mod, new_g, engine);
auto device_spec = cfg.convert_info.engine_settings.device;
auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
AddEngineToGraph(new_mod, new_g, engine, cuda_device);
auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
new_mod.type()->addMethod(new_method);
@@ -271,12 +277,12 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, C
return new_mod;
}

torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine) {
torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CudaDevice cuda_device) {
std::ostringstream engine_id;
engine_id << reinterpret_cast<const int*>(&engine);
torch::jit::script::Module new_mod("tensorrt_engine_mod_" + engine_id.str());
auto new_g = std::make_shared<torch::jit::Graph>();
AddEngineToGraph(new_mod, new_g, engine);
AddEngineToGraph(new_mod, new_g, engine, cuda_device);
auto new_method = new_mod._ivalue()->compilation_unit()->create_function("forward", new_g);
auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
new_mod.type()->addMethod(new_method);
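
Note: every path that embeds an engine now threads device metadata through AddEngineToGraph. A minimal usage sketch for the updated EmbedEngineInNewModule, assuming a serialized engine string is already in hand (the gpu_id of 0, the kGPU device type, and the load_engine helper are illustrative, not part of the API):

#include "core/compiler.h"
#include "core/runtime/runtime.h"

// Sketch: embed a prebuilt TensorRT engine along with the device it targets.
// `engine_blob` is assumed to hold the bytes of a serialized TensorRT engine.
std::string engine_blob = load_engine("model.engine"); // hypothetical helper
auto cuda_device = trtorch::core::runtime::CudaDevice(0, nvinfer1::DeviceType::kGPU);
auto mod = trtorch::core::EmbedEngineInNewModule(engine_blob, cuda_device);
mod.save("embedded.ts"); // device info now rides along with the pickled engine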
3 changes: 2 additions & 1 deletion core/compiler.h
@@ -5,6 +5,7 @@
#include "core/conversion/conversion.h"
#include "core/ir/ir.h"
#include "core/partitioning/partitioning.h"
#include "core/runtime/runtime.h"
#include "torch/csrc/jit/api/module.h"

namespace trtorch {
@@ -22,7 +23,7 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::

torch::jit::script::Module CompileGraph(const torch::jit::script::Module& module, CompileSpec cfg);

torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine);
torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CudaDevice cuda_device);

void set_device(const int gpu_id);

3 changes: 3 additions & 0 deletions core/runtime/BUILD
@@ -10,8 +10,11 @@ config_setting(
cc_library(
name = "runtime",
srcs = [
"CudaDevice.cpp",
"DeviceList.cpp",
"TRTEngine.cpp",
"register_trt_op.cpp",
"runtime.cpp"
],
hdrs = [
"runtime.h",
106 changes: 106 additions & 0 deletions core/runtime/CudaDevice.cpp
@@ -0,0 +1,106 @@
#include "cuda_runtime.h"

#include "core/runtime/runtime.h"
#include "core/util/prelude.h"

namespace trtorch {
namespace core {
namespace runtime {

const std::string DEVICE_INFO_DELIM = "%";

typedef enum { ID_IDX = 0, SM_MAJOR_IDX, SM_MINOR_IDX, DEVICE_TYPE_IDX, DEVICE_NAME_IDX } SerializedDeviceInfoIndex;

CudaDevice::CudaDevice() : id{-1}, major{-1}, minor{-1}, device_type{nvinfer1::DeviceType::kGPU} {}

CudaDevice::CudaDevice(int64_t gpu_id, nvinfer1::DeviceType device_type) {
cudaDeviceProp device_prop;

// Device ID
this->id = gpu_id;

// Get Device Properties
cudaGetDeviceProperties(&device_prop, gpu_id);

// Compute capability major version
this->major = device_prop.major;

// Compute capability minor version
this->minor = device_prop.minor;

std::string device_name(device_prop.name);

// Set Device name
this->device_name = device_name;

// Set Device Type
this->device_type = device_type;
}

// NOTE: Serialization Format for Device Info:
// id%major%minor%(enum)device_type%device_name

CudaDevice::CudaDevice(std::string device_info) {
LOG_DEBUG("Deserializing Device Info: " << device_info);

std::vector<std::string> tokens;
int64_t start = 0;
int64_t end = device_info.find(DEVICE_INFO_DELIM);

while (end != -1) {
tokens.push_back(device_info.substr(start, end - start));
start = end + DEVICE_INFO_DELIM.size();
end = device_info.find(DEVICE_INFO_DELIM, start);
}
tokens.push_back(device_info.substr(start, end - start));

TRTORCH_CHECK(tokens.size() == DEVICE_NAME_IDX + 1, "Unable to deserialize program target device information");

id = std::stoi(tokens[ID_IDX]);
major = std::stoi(tokens[SM_MAJOR_IDX]);
minor = std::stoi(tokens[SM_MINOR_IDX]);
device_type = (nvinfer1::DeviceType)(std::stoi(tokens[DEVICE_TYPE_IDX]));
device_name = tokens[DEVICE_NAME_IDX];

LOG_DEBUG("Deserialized Device Info: " << *this);
}

std::string CudaDevice::serialize() {
std::vector<std::string> content;
content.resize(DEVICE_NAME_IDX + 1);

content[ID_IDX] = std::to_string(id);
content[SM_MAJOR_IDX] = std::to_string(major);
content[SM_MINOR_IDX] = std::to_string(minor);
content[DEVICE_TYPE_IDX] = std::to_string((int64_t)device_type);
content[DEVICE_NAME_IDX] = device_name;

std::stringstream ss;
for (size_t i = 0; i < content.size() - 1; i++) {
ss << content[i] << DEVICE_INFO_DELIM;
}
ss << content[DEVICE_NAME_IDX];

std::string serialized_device_info = ss.str();

LOG_DEBUG("Serialized Device Info: " << serialized_device_info);

return serialized_device_info;
}

std::string CudaDevice::getSMCapability() const {
std::stringstream ss;
ss << major << "." << minor;
return ss.str();
}

std::ostream& operator<<(std::ostream& os, const CudaDevice& device) {
os << "Device(ID: " << device.id << ", Name: " << device.device_name << ", SM Capability: " << device.major << '.'
<< device.minor << ", Type: " << device.device_type << ')';
return os;
}

} // namespace runtime
} // namespace core
} // namespace trtorch
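
For reference, the scheme above flattens a device record into a single "%"-delimited string and parses it back positionally. A round-trip sketch with illustrative values (the SM capability 8.6 and the device name are examples, not guaranteed output):

// Sketch: serialize and rebuild a CudaDevice (values illustrative).
trtorch::core::runtime::CudaDevice dev(0, nvinfer1::DeviceType::kGPU);
std::string blob = dev.serialize(); // e.g. "0%8%6%0%NVIDIA GeForce RTX 3090"
trtorch::core::runtime::CudaDevice restored(blob); // recovers id, SM version, type, name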
45 changes: 45 additions & 0 deletions core/runtime/DeviceList.cpp
@@ -0,0 +1,45 @@
#include "cuda_runtime.h"

#include "core/runtime/runtime.h"
#include "core/util/prelude.h"

namespace trtorch {
namespace core {
namespace runtime {

DeviceList::DeviceList() {
int num_devices = 0;
auto status = cudaGetDeviceCount(&num_devices);
TRTORCH_ASSERT((status == cudaSuccess), "Unable to read CUDA capable devices. Return status: " << status);
for (int i = 0; i < num_devices; i++) {
device_list[i] = CudaDevice(i, nvinfer1::DeviceType::kGPU);
}

// REVIEW: DO WE CARE ABOUT DLA?

LOG_DEBUG("Runtime:\n Available CUDA Devices: \n" << this->dump_list());
}

void DeviceList::insert(int device_id, CudaDevice cuda_device) {
device_list[device_id] = cuda_device;
}

CudaDevice DeviceList::find(int device_id) {
return device_list[device_id];
}

DeviceList::DeviceMap DeviceList::get_devices() {
return device_list;
}

std::string DeviceList::dump_list() {
std::stringstream ss;
for (auto it = device_list.begin(); it != device_list.end(); ++it) {
ss << " " << it->second << std::endl;
}
return ss.str();
}

} // namespace runtime
} // namespace core
} // namespace trtorch
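
A short sketch of how the list can be exercised, assuming it is constructed once at runtime initialization (the variable names are illustrative):

#include <iostream>

// Sketch: enumerate the CUDA devices visible to the runtime.
trtorch::core::runtime::DeviceList devices; // constructor queries cudaGetDeviceCount
auto dev0 = devices.find(0);                // CudaDevice registered for GPU 0
std::cout << devices.dump_list() << std::endl;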
55 changes: 46 additions & 9 deletions core/runtime/TRTEngine.cpp
@@ -1,5 +1,6 @@
#include <algorithm>

#include <cuda_runtime.h>
#include "NvInfer.h"
#include "torch/csrc/jit/frontend/function_schema_parser.h"

@@ -10,30 +11,55 @@ namespace trtorch {
namespace core {
namespace runtime {

typedef enum { ABI_TARGET_IDX = 0, DEVICE_IDX, ENGINE_IDX } SerializedInfoIndex;

std::string slugify(std::string s) {
std::replace(s.begin(), s.end(), '.', '_');
return s;
}

TRTEngine::TRTEngine(std::string serialized_engine)
TRTEngine::TRTEngine(std::string serialized_engine, CudaDevice cuda_device)
: logger(
std::string("[] - "),
util::logging::get_logger().get_reportable_severity(),
util::logging::get_logger().get_is_colored_output_on()) {
std::string _name = "deserialized_trt";
new (this) TRTEngine(_name, serialized_engine);
new (this) TRTEngine(_name, serialized_engine, cuda_device);
}

TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine)
TRTEngine::TRTEngine(std::vector<std::string> serialized_info)
: logger(
std::string("[] = "),
util::logging::get_logger().get_reportable_severity(),
util::logging::get_logger().get_is_colored_output_on()) {
TRTORCH_CHECK(
serialized_info.size() == ENGINE_IDX + 1, "Program to be deserialized targets an incompatible TRTorch ABI");
TRTORCH_CHECK(
serialized_info[ABI_TARGET_IDX] == ABI_VERSION,
"Program to be deserialized targets a different TRTorch ABI Version ("
<< serialized_info[ABI_TARGET_IDX] << ") than the TRTorch Runtime ABI (" << ABI_VERSION << ")");
std::string _name = "deserialized_trt";
std::string engine_info = serialized_info[ENGINE_IDX];

CudaDevice cuda_device = deserialize_device(serialized_info[DEVICE_IDX]);
new (this) TRTEngine(_name, engine_info, cuda_device);
}

TRTEngine::TRTEngine(std::string mod_name, std::string serialized_engine, CudaDevice cuda_device)
: logger(
std::string("[") + mod_name + std::string("_engine] - "),
util::logging::get_logger().get_reportable_severity(),
util::logging::get_logger().get_is_colored_output_on()) {
device_info = cuda_device;
set_cuda_device(device_info);

rt = nvinfer1::createInferRuntime(logger);

name = slugify(mod_name) + "_engine";

cuda_engine = rt->deserializeCudaEngine(serialized_engine.c_str(), serialized_engine.size());
TRTORCH_CHECK((cuda_engine != nullptr), "Unable to deserialize the TensorRT engine");

// Easy way to get a unique name for each engine, maybe there is a more
// descriptive way (using something associated with the graph maybe)
id = reinterpret_cast<EngineID>(cuda_engine);
@@ -63,6 +89,7 @@ TRTEngine& TRTEngine::operator=(const TRTEngine& other) {
id = other.id;
rt = other.rt;
cuda_engine = other.cuda_engine;
device_info = other.device_info;
exec_ctx = other.exec_ctx;
num_io = other.num_io;
return (*this);
@@ -85,18 +112,28 @@ TRTEngine::~TRTEngine() {
namespace {
static auto TRTORCH_UNUSED TRTEngineTSRegistration =
torch::class_<TRTEngine>("tensorrt", "Engine")
.def(torch::init<std::string>())
.def(torch::init<std::vector<std::string>>())
// TODO: .def("__call__", &TRTEngine::Run)
// TODO: .def("run", &TRTEngine::Run)
.def_pickle(
[](const c10::intrusive_ptr<TRTEngine>& self) -> std::string {
auto serialized_engine = self->cuda_engine->serialize();
return std::string((const char*)serialized_engine->data(), serialized_engine->size());
[](const c10::intrusive_ptr<TRTEngine>& self) -> std::vector<std::string> {
// Serialize TensorRT engine
auto serialized_trt_engine = self->cuda_engine->serialize();

// Adding device info related meta data to the serialized file
auto trt_engine = std::string((const char*)serialized_trt_engine->data(), serialized_trt_engine->size());

std::vector<std::string> serialize_info;
serialize_info.push_back(ABI_VERSION);
serialize_info.push_back(serialize_device(self->device_info));
serialize_info.push_back(trt_engine);
return serialize_info;
},
[](std::string seralized_engine) -> c10::intrusive_ptr<TRTEngine> {
return c10::make_intrusive<TRTEngine>(std::move(seralized_engine));
[](std::vector<std::string> serialized_info) -> c10::intrusive_ptr<TRTEngine> {
return c10::make_intrusive<TRTEngine>(std::move(serialized_info));
});
} // namespace

} // namespace runtime
} // namespace core
} // namespace trtorch
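
The pickled form of a TRTEngine is now a three-field vector rather than a bare engine string, indexed by SerializedInfoIndex. A sketch of the layout with illustrative contents (serialize_device and deserialize_device are assumed to wrap CudaDevice::serialize and the deserializing constructor):

// Sketch: what def_pickle produces for one engine (contents illustrative).
// serialized_info[ABI_TARGET_IDX] -> runtime ABI version string, checked on load
// serialized_info[DEVICE_IDX]     -> "%"-delimited device record, e.g. "0%8%6%0%..."
// serialized_info[ENGINE_IDX]     -> raw bytes of the serialized TensorRT engine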