Commit 5df2536

fix: Bug fixes, code refactor and rebase with master

Signed-off-by: Dheeraj Peri <[email protected]>
2 parents 28da0e8 + 75e86e8

83 files changed, +2559 -456 lines


.github/workflows/docgen.yml

Lines changed: 5 additions & 0 deletions

@@ -17,6 +17,11 @@ jobs:
       username: $GITHUB_ACTOR
       password: ${{secrets.GITHUB_TOKEN}}
     steps:
+      - name: Reclaim space
+        run: |
+          rm -rf /usr/share/dotnet
+          rm -rf /opt/ghc
+          rm -rf "/usr/local/share/boost"
       - uses: actions/checkout@v2
         with:
           ref: ${{github.head_ref}}

core/compiler.cpp

Lines changed: 17 additions & 11 deletions

@@ -32,9 +32,11 @@ void AddEngineToGraph(
     torch::jit::script::Module mod,
     std::shared_ptr<torch::jit::Graph>& g,
     const std::string& serialized_engine,
+    runtime::CudaDevice& device_info,
     std::string engine_id = "",
     bool fallback = false) {
-  auto engine_ptr = c10::make_intrusive<runtime::TRTEngine>(mod._ivalue()->name() + engine_id, serialized_engine);
+  auto engine_ptr =
+      c10::make_intrusive<runtime::TRTEngine>(mod._ivalue()->name() + engine_id, serialized_engine, device_info);
   // Get required metadata about the engine out
   auto num_io = engine_ptr->num_io;
   auto name = engine_ptr->name;
@@ -265,7 +267,9 @@ GraphAndMapping ConstructFallbackGraph(
       convert_cfg.input_ranges = input_ranges;
       auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, named_params);
       auto temp_g = std::make_shared<torch::jit::Graph>();
-      AddEngineToGraph(new_mod, temp_g, engine, trt_engine_id.str(), true);
+      auto device_spec = convert_cfg.engine_settings.device;
+      auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
+      AddEngineToGraph(new_mod, temp_g, engine, cuda_device, trt_engine_id.str(), true);

       seg_block.update_graph(temp_g);
       AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g);
@@ -302,15 +306,15 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo
   torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
   std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
   for (const torch::jit::script::Method& method : mod.get_methods()) {
-    // Don't convert hidden methods
-    if (method.name().rfind("_", 0)) {
+    // Compile only forward methods. forward method contains the entire graph.
+    if (method.name().compare("forward") == 0) {
       auto new_g = std::make_shared<torch::jit::Graph>();
       auto graph_and_parameters = lowering::Lower(mod, method.name());

       auto g = graph_and_parameters.first;
       auto params = graph_and_parameters.second;
       auto named_params = conversion::get_named_params(g->inputs(), params);
-      LOG_INFO(*g << "(LoweringGraph)\n");
+      LOG_INFO("(LoweredGraph)\n" << *g);

       std::unordered_map<torch::jit::Value*, ir::InputRange> input_ranges;
       for (size_t i = 0; i < g->inputs().size(); ++i) {
@@ -319,7 +323,7 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo
       auto input_ivalues_map = partitioning::generateRandomInputs(input_ranges);
       auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, named_params);
       new_g = graph_and_mapping.first;
-      LOG_INFO(*new_g << "(FallbackGraph)\n");
+      LOG_INFO("(FallbackGraph)\n" << *new_g);

       // if there is no tensorrt engine self in fallback graph, there is no conversion, we just return the initial
       // module
@@ -349,11 +353,13 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, C
   torch::jit::script::Module new_mod(mod._ivalue()->name() + "_trt");
   std::vector<std::shared_ptr<torch::jit::Graph>> graphs;
   for (const torch::jit::script::Method& method : mod.get_methods()) {
-    // Don't convert hidden methods
-    if (method.name().rfind("_", 0)) {
+    // Compile only forward methods. forward method contains the entire graph.
+    if (method.name().compare("forward") == 0) {
       auto engine = ConvertGraphToTRTEngine(mod, method.name(), cfg);
       auto new_g = std::make_shared<torch::jit::Graph>();
-      AddEngineToGraph(new_mod, new_g, engine);
+      auto device_spec = cfg.convert_info.engine_settings.device;
+      auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
+      AddEngineToGraph(new_mod, new_g, engine, cuda_device);
       auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
       auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
       new_mod.type()->addMethod(new_method);
@@ -364,12 +370,12 @@ torch::jit::script::Module CompileGraph(const torch::jit::script::Module& mod, C
   return new_mod;
 }

-torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine) {
+torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CudaDevice cuda_device) {
   std::ostringstream engine_id;
   engine_id << reinterpret_cast<const int*>(&engine);
   torch::jit::script::Module new_mod("tensorrt_engine_mod_" + engine_id.str());
   auto new_g = std::make_shared<torch::jit::Graph>();
-  AddEngineToGraph(new_mod, new_g, engine);
+  AddEngineToGraph(new_mod, new_g, engine, cuda_device);
   auto new_method = new_mod._ivalue()->compilation_unit()->create_function("forward", new_g);
   auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
   new_mod.type()->addMethod(new_method);
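
A note on usage: a minimal sketch of how the reworked embedding API might be driven, assuming the trtorch::core namespace nesting and the CudaDevice constructor shown in this diff. LoadEngineBlob is a hypothetical helper, not part of the commit.

// Sketch only: a serialized TensorRT engine blob is assumed to be in hand.
std::string engine_blob = LoadEngineBlob("model.engine"); // hypothetical helper
// The target device now travels with the engine instead of being implicit.
auto cuda_device = trtorch::core::runtime::CudaDevice(/*gpu_id=*/0, nvinfer1::DeviceType::kGPU);
auto trt_mod = trtorch::core::EmbedEngineInNewModule(engine_blob, cuda_device);
trt_mod.save("engine_mod.ts");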

core/compiler.h

Lines changed: 2 additions & 1 deletion

@@ -5,6 +5,7 @@
 #include "core/conversion/conversion.h"
 #include "core/ir/ir.h"
 #include "core/partitioning/partitioning.h"
+#include "core/runtime/runtime.h"
 #include "torch/csrc/jit/api/module.h"

 namespace trtorch {
@@ -22,7 +23,7 @@ std::string ConvertGraphToTRTEngine(const torch::jit::script::Module& mod, std::

 torch::jit::script::Module CompileGraph(const torch::jit::script::Module& module, CompileSpec cfg);

-torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine);
+torch::jit::script::Module EmbedEngineInNewModule(const std::string& engine, runtime::CudaDevice cuda_device);

 void set_device(const int gpu_id);

core/conversion/conversion.cpp

Lines changed: 14 additions & 2 deletions

@@ -45,8 +45,15 @@ c10::optional<torch::jit::IValue> EvaluateNode(ConversionCtx* ctx, const torch::
     if (result) {
       // WARN: If the converter returns None then should pass through
       // but if repeated dep this section will get called each time
-      ctx->evaluated_value_map[eval_in] = std::move(result.value());
-      eval_args[eval_in] = &(ctx->evaluated_value_map[eval_in]);
+      auto val = result.value();
+      if (val.isCustomClass()) {
+        auto cont = val.toCustomClass<TensorContainer>();
+        ctx->AssociateValueAndTensor(eval_in, cont->tensor());
+        eval_args[eval_in] = ctx->value_tensor_map[eval_in];
+      } else {
+        ctx->AssociateValueAndIValue(eval_in, val);
+        eval_args[eval_in] = &(ctx->evaluated_value_map[eval_in]);
+      }
     }
   } else {
     TRTORCH_THROW_ERROR(
@@ -374,6 +381,11 @@ void ConvertBlockToNetDef(
       } else {
         TRTORCH_THROW_ERROR("Unsupported return type for evaluated node");
       }
+    } else if (eval.value().isCustomClass()) {
+      auto container = eval.value().toCustomClass<TensorContainer>();
+      auto tensor = container->tensor();
+      LOG_DEBUG(ctx->logger, "Found the value to be an ITensor of shape: " << tensor->getDimensions());
+      ctx->AssociateValueAndTensor(n->output(0), tensor);
     } else if (!eval.value().isTensor()) {
       LOG_DEBUG(ctx->logger, "Found the value to be: " << eval.value());
       ctx->AssociateValueAndIValue(n->output(0), eval.value());
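
Taken together, the two hunks let evaluators hand back TensorRT tensors, not just IValues. A minimal sketch of the shared dispatch, using only names that appear in this diff (the helper itself is hypothetical, not a function in the codebase):

// Sketch, assuming the ConversionCtx / TensorContainer APIs used above:
// a custom-class result is taken to box an nvinfer1::ITensor in a
// TensorContainer; anything else is recorded as a plain IValue.
void RecordEvalResult(ConversionCtx* ctx, const torch::jit::Value* v, torch::jit::IValue val) {
  if (val.isCustomClass()) {
    ctx->AssociateValueAndTensor(v, val.toCustomClass<TensorContainer>()->tensor());
  } else {
    ctx->AssociateValueAndIValue(v, val);
  }
}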

core/conversion/conversionctx/ConversionCtx.cpp

Lines changed: 7 additions & 5 deletions

@@ -23,16 +23,15 @@ std::ostream& operator<<(std::ostream& os, const BuilderSettings& s) {
      << "\n Max Workspace Size: " << s.workspace_size;

   if (s.max_batch_size != 0) {
-    os << "\n Max Batch Size: " << s.max_batch_size;
+    os << "\n Max Batch Size: " << s.max_batch_size;
   } else {
-    os << "\n Max Batch Size: Not set";
+    os << "\n Max Batch Size: Not set";
   }

   os << "\n Device Type: " << s.device.device_type \
      << "\n GPU ID: " << s.device.gpu_id;
-  if (s.device.device_type == nvinfer1::DeviceType::kDLA)
-  {
-    os << "\n DLACore: " << s.device.dla_core;
+  if (s.device.device_type == nvinfer1::DeviceType::kDLA) {
+    os << "\n DLACore: " << s.device.dla_core;
   }
   os << "\n Engine Capability: " << s.capability \
      << "\n Calibrator Created: " << (s.calibrator != nullptr);
@@ -146,6 +145,9 @@ torch::jit::IValue* ConversionCtx::AssociateValueAndIValue(const torch::jit::Val

 std::string ConversionCtx::SerializeEngine() {
   auto engine = builder->buildEngineWithConfig(*net, *cfg);
+  if (!engine) {
+    TRTORCH_THROW_ERROR("Building TensorRT engine failed");
+  }
   auto serialized_engine = engine->serialize();
   engine->destroy();
   auto engine_str = std::string((const char*)serialized_engine->data(), serialized_engine->size());
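
The new guard matters because TensorRT's buildEngineWithConfig signals failure by returning nullptr rather than throwing, so the old code would have crashed later on engine->serialize(). The pattern in isolation, as a hedged sketch against the TensorRT 7-era API used in this file:

// builder, net, and cfg are assumed to be populated TensorRT objects,
// as they are on ConversionCtx.
nvinfer1::ICudaEngine* engine = builder->buildEngineWithConfig(*net, *cfg);
if (!engine) {
  TRTORCH_THROW_ERROR("Building TensorRT engine failed"); // fail fast, not on a null deref
}
nvinfer1::IHostMemory* blob = engine->serialize();
std::string engine_str(static_cast<const char*>(blob->data()), blob->size());
engine->destroy();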

core/conversion/converters/BUILD

Lines changed: 1 addition & 0 deletions

@@ -35,6 +35,7 @@ cc_library(
         "impl/batch_norm.cpp",
         "impl/concat.cpp",
         "impl/constant.cpp",
+        "impl/constant_pad.cpp",
         "impl/conv_deconv.cpp",
         "impl/cumsum.cpp",
         "impl/element_wise.cpp",

core/conversion/converters/converter_util.cpp

Lines changed: 1 addition & 1 deletion

@@ -53,7 +53,7 @@ nvinfer1::ITensor* addUnpadding(
     TRTORCH_CHECK(shuffle_layer, "Unable to create shuffle layer");
     shuffle_layer->setReshapeDimensions(newDims);
     shuffle_layer->setZeroIsPlaceholder(use_zeros);
-    shuffle_layer->setName((util::node_info(n) + " [Reshape to " + util::toStr(newDims)).c_str() + ']');
+    shuffle_layer->setName((util::node_info(n) + " [Reshape to " + util::toStr(newDims) + "]").c_str());
     return shuffle_layer->getOutput(0);
   } else {
     return tensor;
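
This one-liner fixes a genuine pointer bug: the old code called .c_str() before appending ']', so the "+ ']'" performed pointer arithmetic on the returned const char* (advancing it by 93, the ASCII value of ']') instead of concatenating. A standalone illustration of the fixed pattern:

#include <iostream>
#include <string>

int main() {
  std::string name = "node [Reshape to [1,2,3]";
  // Buggy shape: name.c_str() + ']' moves the pointer 93 bytes past the
  // start of the buffer -- an out-of-bounds read, not concatenation.
  // Fixed shape: concatenate on std::string first, convert once at the end.
  std::string fixed = name + "]";
  std::cout << fixed.c_str() << "\n"; // node [Reshape to [1,2,3]]
  return 0;
}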
core/conversion/converters/impl/constant_pad.cpp

Lines changed: 156 additions & 0 deletions

@@ -0,0 +1,156 @@
+#include <ATen/ATen.h>
+#include <vector>
+#include "NvInfer.h"
+#include "core/conversion/converters/converters.h"
+#include "core/util/prelude.h"
+#include "torch/torch.h"
+
+namespace trtorch {
+namespace core {
+namespace conversion {
+namespace converters {
+namespace impl {
+namespace {
+
+auto constant_pad_registrations TRTORCH_UNUSED = RegisterNodeConversionPatterns().pattern(
+    {"aten::constant_pad_nd(Tensor self, int[] pad, Scalar value=0) -> (Tensor)",
+     [](ConversionCtx* ctx, const torch::jit::Node* n, args& args) -> bool {
+       auto in = args[0].ITensor();
+       auto inDims = in->getDimensions();
+       int64_t inRank = inDims.nbDims;
+       auto padding = args[1].unwrapToIntList().vec();
+       int64_t padSize = padding.size();
+       auto value = args[2].unwrapToScalar().to<float>();
+
+       TRTORCH_CHECK(padSize % 2 == 0, "Length of pad must be even but instead it equals " << padSize);
+
+       int64_t l_pad = padSize / 2;
+       TRTORCH_CHECK(
+           inRank >= (int64_t)l_pad,
+           "Length of pad should be no more than twice the number of "
+           "dimensions of the input. Pad length is "
+               << padSize << "while the input has " << inRank << "dimensions.");
+
+       // TODO negative padding. When the pad is negative, we need to crop the image.
+
+       std::vector<nvinfer1::ITensor*> tensors_vec;
+       // input: (N, C, D_in, H_in, W_in).
+       // padding: (padding_left, padding_right, padding_top, padding_bottom, padding_front, padding_back)
+       // When axis is inRank - 1, making W_out = W_in + padding_left + padding_right.
+       // When axis is inRank - 2, making H_out = H_in + padding_top + padding_bottom.
+       // When axis is inRank - 3, making D_out = D_in + padding_front + padding_back.
+       for (int64_t i = 0; i < l_pad; i++) {
+         int64_t axis = inRank - (i + 1); // axis = {inRank - 1, inRank - 2, inRank - 3}
+         int64_t padding_index = i * 2;
+
+         if (padding[padding_index] > 0) { // left/top/front padding value
+           tensors_vec.clear();
+           if (ctx->input_is_dynamic) {
+             at::Tensor left_indices = torch::tensor({0}, torch::kInt32);
+             auto indicesTensor = tensor_to_const(ctx, left_indices);
+             auto left_gather_layer = ctx->net->addGather(*in, *indicesTensor, axis);
+             auto left_gather_out = left_gather_layer->getOutput(0);
+
+             // fill the left_gather_out with value
+             auto fill_layer = ctx->net->addFill(nvinfer1::Dims{1, {1}}, nvinfer1::FillOperation::kLINSPACE);
+             auto shape_gather_out = ctx->net->addShape(*left_gather_out)->getOutput(0);
+             fill_layer->setInput(0, *shape_gather_out);
+             at::Tensor value_tensor = torch::tensor(value, torch::kFloat32);
+             auto valueTensor = tensor_to_const(ctx, value_tensor);
+             fill_layer->setInput(1, *valueTensor);
+             at::Tensor delta_tensor = torch::zeros(inRank);
+             auto deltaTensor = tensor_to_const(ctx, delta_tensor);
+             fill_layer->setInput(2, *deltaTensor);
+             auto padTensor = fill_layer->getOutput(0);
+
+             for (int i = 0; i < padding[padding_index]; i++) {
+               tensors_vec.push_back(padTensor);
+             }
+           } else {
+             inDims.d[axis] = padding[padding_index];
+             auto fill_layer = ctx->net->addFill(inDims, nvinfer1::FillOperation::kLINSPACE);
+             at::Tensor value_tensor = torch::tensor(value, torch::kFloat32);
+             auto valueTensor = tensor_to_const(ctx, value_tensor);
+             fill_layer->setInput(1, *valueTensor);
+             at::Tensor delta_tensor = torch::zeros(inRank);
+             auto deltaTensor = tensor_to_const(ctx, delta_tensor);
+             fill_layer->setInput(2, *deltaTensor);
+             auto padTensor = fill_layer->getOutput(0);
+
+             tensors_vec.push_back(padTensor);
+           }
+
+           tensors_vec.push_back(in);
+           auto concat_layer = ctx->net->addConcatenation(tensors_vec.data(), tensors_vec.size());
+           concat_layer->setAxis(axis);
+           in = concat_layer->getOutput(0);
+           inDims = in->getDimensions();
+         }
+
+         if (padding[padding_index + 1] > 0) { // right/bottom/back padding value
+           tensors_vec.clear();
+           tensors_vec.push_back(in);
+
+           nvinfer1::ITensor* indicesTensor = NULL;
+           if (inDims.d[axis] == -1) {
+             auto shapeTensor = ctx->net->addShape(*in)->getOutput(0);
+             at::Tensor dimValue = torch::tensor({axis}, torch::kInt32);
+             auto dimTensor = tensor_to_const(ctx, dimValue);
+             indicesTensor = ctx->net->addGather(*shapeTensor, *dimTensor, 0)->getOutput(0);
+             auto oneTensor = tensor_to_const(ctx, torch::tensor({1}, torch::kInt32));
+             indicesTensor = ctx->net->addElementWise(*indicesTensor, *oneTensor, nvinfer1::ElementWiseOperation::kSUB)
+                                 ->getOutput(0);
+           } else {
+             auto indices = torch::tensor({inDims.d[axis] - 1}, torch::kInt32);
+             indicesTensor = tensor_to_const(ctx, indices);
+           }
+           auto right_gather_layer = ctx->net->addGather(*in, *indicesTensor, axis);
+           auto right_gather_out = right_gather_layer->getOutput(0);
+
+           if (ctx->input_is_dynamic) {
+             // fill the right_gather_out with value
+             auto fill_layer = ctx->net->addFill(nvinfer1::Dims{1, {1}}, nvinfer1::FillOperation::kLINSPACE);
+             auto shape_gather_out = ctx->net->addShape(*right_gather_out)->getOutput(0);
+             fill_layer->setInput(0, *shape_gather_out);
+             at::Tensor value_tensor = torch::tensor(value, torch::kFloat32);
+             auto valueTensor = tensor_to_const(ctx, value_tensor);
+             fill_layer->setInput(1, *valueTensor);
+             at::Tensor delta_tensor = torch::zeros(inRank);
+             auto deltaTensor = tensor_to_const(ctx, delta_tensor);
+             fill_layer->setInput(2, *deltaTensor);
+             auto padTensor = fill_layer->getOutput(0);
+
+             for (int i = 0; i < padding[padding_index + 1]; i++) {
+               tensors_vec.push_back(padTensor);
+             }
+           } else {
+             inDims.d[axis] = padding[padding_index + 1];
+             auto fill_layer = ctx->net->addFill(inDims, nvinfer1::FillOperation::kLINSPACE);
+             at::Tensor value_tensor = torch::tensor(value, torch::kFloat32);
+             auto valueTensor = tensor_to_const(ctx, value_tensor);
+             fill_layer->setInput(1, *valueTensor);
+             at::Tensor delta_tensor = torch::zeros(inRank);
+             auto deltaTensor = tensor_to_const(ctx, delta_tensor);
+             fill_layer->setInput(2, *deltaTensor);
+             auto padTensor = fill_layer->getOutput(0);
+
+             tensors_vec.push_back(padTensor);
+           }
+           auto concat_layer = ctx->net->addConcatenation(tensors_vec.data(), tensors_vec.size());
+           concat_layer->setAxis(axis);
+           in = concat_layer->getOutput(0);
+           inDims = in->getDimensions();
+         }
+       }
+
+       auto out = ctx->AssociateValueAndTensor(n->outputs()[0], in);
+       LOG_DEBUG("Output tensor shape: " << out->getDimensions());
+       return true;
+     }});
+
+} // namespace
+} // namespace impl
+} // namespace converters
+} // namespace conversion
+} // namespace core
+} // namespace trtorch
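
For reference, the op semantics the converter above must reproduce: aten::constant_pad_nd consumes the pad list in pairs from the last dimension inward, exactly as the axis loop assumes. A small standalone libtorch check (illustrative only; shapes chosen arbitrarily):

#include <torch/torch.h>
#include <iostream>

int main() {
  auto x = torch::ones({1, 3, 4, 5});
  // pad = {left, right, top, bottom}: last dim 5 -> 5 + 1 + 2 = 8,
  // second-to-last dim 4 -> 4 + 3 + 0 = 7.
  auto y = torch::constant_pad_nd(x, {1, 2, 3, 0}, /*value=*/0.5);
  std::cout << y.sizes() << "\n"; // prints [1, 3, 7, 8]
  return 0;
}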

core/conversion/converters/impl/layer_norm.cpp

Lines changed: 24 additions & 5 deletions

@@ -117,12 +117,31 @@ auto layer_norm_registrations TRTORCH_UNUSED = RegisterNodeConversionPatterns().
       }

       auto power = Weights(ctx, at::ones(expand_size));
-      auto scale_nd = ctx->net->addScaleNd(
-          *div_out, nvinfer1::ScaleMode::kELEMENTWISE, beta_weights.data, gamma_weights.data, power.data, 1);
-      scale_nd->setName((util::node_info(n) + "_scale_nd").c_str());
-      auto scale_nd_out = scale_nd->getOutput(0);

-      ctx->AssociateValueAndTensor(n->outputs()[0], scale_nd_out);
+      auto gamma_tensor = ctx->net->addConstant(gamma_weights.shape, gamma_weights.data)->getOutput(0);
+      auto scale_l = add_elementwise(
+          ctx, nvinfer1::ElementWiseOperation::kPROD, div_out, gamma_tensor, (util::node_info(n) + "_scale").c_str());
+
+      auto beta_tensor = ctx->net->addConstant(beta_weights.shape, beta_weights.data)->getOutput(0);
+      auto shift_l = add_elementwise(
+          ctx,
+          nvinfer1::ElementWiseOperation::kSUM,
+          scale_l->getOutput(0),
+          beta_tensor,
+          (util::node_info(n) + "_shift").c_str());
+
+      auto power_tensor = ctx->net->addConstant(power.shape, power.data)->getOutput(0);
+      auto power_l = add_elementwise(
+          ctx,
+          nvinfer1::ElementWiseOperation::kPOW,
+          shift_l->getOutput(0),
+          power_tensor,
+          (util::node_info(n) + "_power").c_str());
+
+      power_l->setName((util::node_info(n) + "_scale_nd").c_str());
+      auto power_l_out = power_l->getOutput(0);
+
+      ctx->AssociateValueAndTensor(n->outputs()[0], power_l_out);
       return true;
     }});
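
A note on the math: the deleted IScaleLayer computed the fused elementwise form below, and the three new elementwise layers (kPROD, kSUM, kPOW) reproduce it stage by stage. With \hat{x} the normalized input (div_out):

y = \left(\hat{x} \cdot \gamma + \beta\right)^{p}, \qquad p = \mathbf{1}

Since power is built from at::ones(expand_size), the kPOW stage is an identity and the result is the standard layer-norm affine transform \hat{x} \cdot \gamma + \beta.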
