
Commit da2cbc0

chore: Rebasing to master
Signed-off-by: Dheeraj Peri <[email protected]>
2 parents 5df2536 + 2fbbbc1 commit da2cbc0


121 files changed: +9170 / -2167 lines


README.md

Lines changed: 16 additions & 12 deletions
@@ -18,9 +18,13 @@ More Information / System Architecture:
 #include "trtorch/trtorch.h"

 ...
-auto compile_settings = trtorch::CompileSpec(dims);
+// Set input datatypes. Allowed options torch::{kFloat, kHalf, kChar, kInt32, kBool}
+// Size of input_dtypes should match number of inputs to the network.
+// If input_dtypes is not set, default precision follows traditional PyT / TRT rules
+auto input = trtorch::CompileSpec::Input(dims, torch::kHalf);
+auto compile_settings = trtorch::CompileSpec({input});
 // FP16 execution
-compile_settings.op_precision = torch::kFloat;
+compile_settings.enabled_precisions = {torch::kHalf};
 // Compile module
 auto trt_mod = trtorch::CompileGraph(ts_mod, compile_settings);
 // Run like normal
@@ -36,14 +40,14 @@ import trtorch

 ...
 compile_settings = {
-    "input_shapes": [
-        {
-            "min": [1, 3, 224, 224],
-            "opt": [1, 3, 512, 512],
-            "max": [1, 3, 1024, 1024]
-        }, # For static size [1, 3, 224, 224]
-    ],
-    "op_precision": torch.half # Run with FP16
+    "inputs": [trtorch.Input(
+        min_shape=[1, 3, 224, 224],
+        opt_shape=[1, 3, 512, 512],
+        max_shape=[1, 3, 1024, 1024],
+        # For static size shape=[1, 3, 224, 224]
+        dtype=torch.half, # Datatype of input tensor. Allowed options torch.(float|half|int8|int32|bool)
+    )],
+    "enabled_precision": {torch.half}, # Run with FP16
 }

 trt_ts_module = trtorch.compile(torch_script_module, compile_settings)
@@ -54,9 +58,9 @@ torch.jit.save(trt_ts_module, "trt_torchscript_module.ts")
 ```

 > Notes on running in lower precisions:
-> - Set precision with compile_spec.op_precision
+> - Enable lower precisions with compile_spec.enabled_precisions
 > - The module should be left in FP32 before compilation (FP16 can support half tensor models)
-> - In FP16 only input tensors should be converted to FP16, other precisions use FP32
+> - In FP16 only input tensors by default should be FP16, other precisions use FP32. This can be overridden by setting Input::dtype

 ## Platform Support
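Taken together, the README changes replace the old `op_precision` and `input_shapes` fields with per-input `Input` specs plus a set of enabled kernel precisions. A minimal C++ sketch of the resulting workflow, based only on the API surface shown in this diff (the static-shape `Input` constructor and the exact module types are assumptions):

```cpp
#include "torch/script.h"
#include "trtorch/trtorch.h"

// Sketch: compile a TorchScript module with an explicit FP16 input and FP16 kernels enabled.
// trtorch::CompileSpec::Input(dims, dtype) and enabled_precisions follow the README snippet above.
torch::jit::script::Module CompileHalf(torch::jit::script::Module& ts_mod) {
  auto input = trtorch::CompileSpec::Input({1, 3, 224, 224}, torch::kHalf); // shape + input dtype
  auto compile_settings = trtorch::CompileSpec({input});
  compile_settings.enabled_precisions = {torch::kHalf}; // FP16 kernels allowed in addition to FP32
  return trtorch::CompileGraph(ts_mod, compile_settings);
}
```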

core/compiler.cpp

Lines changed: 46 additions & 6 deletions
@@ -259,12 +259,12 @@ GraphAndMapping ConstructFallbackGraph(
   trt_engine_id << reinterpret_cast<const int*>(&seg_block);

   if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) {
-    std::vector<ir::InputRange> input_ranges;
+    std::vector<ir::Input> inputs;
     for (auto& shape : seg_block.in_shape()) {
-      input_ranges.push_back(ir::InputRange(shape));
+      inputs.push_back(ir::Input(shape));
     }
     // update the input ranges for each segments
-    convert_cfg.input_ranges = input_ranges;
+    convert_cfg.inputs = inputs;
     auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, named_params);
     auto temp_g = std::make_shared<torch::jit::Graph>();
     auto device_spec = convert_cfg.engine_settings.device;
@@ -316,11 +316,11 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo
   auto named_params = conversion::get_named_params(g->inputs(), params);
   LOG_INFO("(LoweredGraph)\n" << *g);

-  std::unordered_map<torch::jit::Value*, ir::InputRange> input_ranges;
+  std::unordered_map<torch::jit::Value*, ir::Input> inputs;
   for (size_t i = 0; i < g->inputs().size(); ++i) {
-    input_ranges.insert({g->inputs()[i], cfg.convert_info.input_ranges[i]});
+    inputs.insert({g->inputs()[i], cfg.convert_info.inputs[i]});
   }
-  auto input_ivalues_map = partitioning::generateRandomInputs(input_ranges);
+  auto input_ivalues_map = partitioning::generateRandomInputs(inputs);
   auto graph_and_mapping = ConstructFallbackGraph(new_mod, g->block(), input_ivalues_map, cfg, named_params);
   new_g = graph_and_mapping.first;
   LOG_INFO("(FallbackGraph)\n" << *new_g);
@@ -332,6 +332,46 @@ torch::jit::script::Module CompileGraphWithFallback(const torch::jit::script::Mo
   return mod;
 }

+  // <<<<<<< HEAD
+  // =======
+  // std::unordered_map<torch::jit::Value*, torch::jit::Value*> old_to_new_g;
+  // // add global graph's input to old_to_new_g mapping
+  // for (auto input : g->inputs()) {
+  //   util::getOrAddInputForValue(input, new_g, old_to_new_g);
+  // }
+  // for (auto& seg_block : segmented_blocks) {
+  //   std::string cur_block_target =
+  //       seg_block.target() == partitioning::SegmentedBlock::kTensorRT ? "TensorRT" : "Torch";
+  //   LOG_INFO(*seg_block.g() << "(Sub Graph" << cur_block_target << "Block)\n");
+  //   std::ostringstream trt_engine_id;
+  //   trt_engine_id << reinterpret_cast<const int*>(&seg_block);
+  //   if (seg_block.target() == partitioning::SegmentedBlock::kTensorRT) {
+  //     std::vector<ir::Input> inputs;
+  //     for (auto& shape : seg_block.in_shape()) {
+  //       inputs.push_back(ir::Input(shape));
+  //     }
+  //     // update the input ranges for each segments
+  //     convert_cfg.inputs = inputs;
+  //     auto engine = conversion::ConvertBlockToEngine(seg_block.block(), convert_cfg, named_params);
+  //     auto temp_g = std::make_shared<torch::jit::Graph>();
+  //     auto device_spec = convert_cfg.engine_settings.device;
+  //     auto cuda_device = runtime::CudaDevice(device_spec.gpu_id, device_spec.device_type);
+  //     AddEngineToGraph(new_mod, temp_g, engine, cuda_device, trt_engine_id.str(), true);
+  //
+  //     seg_block.update_graph(temp_g);
+  //     AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g);
+  //   } else {
+  //     AddSegmentedBlockToGraph(new_g, seg_block, old_to_new_g);
+  //   }
+  // }
+  //
+  // for (auto& output : g->outputs()) {
+  //   new_g->registerOutput(old_to_new_g[output]);
+  // }
+  //
+  // LOG_INFO(*new_g << "(FallbackGraph)\n");
+  //
+  // >>>>>>> master
   auto new_method = new_mod._ivalue()->compilation_unit()->create_function(method.name(), new_g);
   auto schema = util::GenerateGraphSchema(new_method->name(), new_g);
   new_mod.type()->addMethod(new_method);

core/compiler.h

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ namespace trtorch {
 namespace core {

 struct CompileSpec {
-  CompileSpec(std::vector<ir::InputRange> input_ranges) : convert_info(std::move(input_ranges)) {}
+  CompileSpec(std::vector<ir::Input> inputs) : convert_info(std::move(inputs)) {}
   conversion::ConversionInfo convert_info;
   partitioning::PartitionInfo partition_info;
 };
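Internally, the same change threads through core::CompileSpec, which now carries ir::Input specs rather than ir::InputRange. A rough sketch of how the internal pieces fit together after the rename (the include path, the namespace alias, and the ir::Input shape constructor are assumptions based on the surrounding diffs):

```cpp
#include <vector>
#include "NvInfer.h"
#include "core/compiler.h" // assumed include path

namespace core = trtorch::core; // alias for brevity

// Sketch: build the internal compile spec from per-input specs and enable FP16 kernels.
core::CompileSpec MakeInternalSpec() {
  std::vector<core::ir::Input> inputs;
  inputs.push_back(core::ir::Input({1, 3, 224, 224})); // static-shape input, mirroring core/compiler.cpp above
  core::CompileSpec spec(inputs);
  // Engine-level settings (precisions, device, etc.) live on convert_info.engine_settings.
  spec.convert_info.engine_settings.enabled_precisions.insert(nvinfer1::DataType::kHALF);
  return spec;
}
```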

core/conversion/conversion.cpp

Lines changed: 43 additions & 20 deletions
@@ -125,10 +125,7 @@ void AddLayer(ConversionCtx* ctx, const torch::jit::Node* n) {
           << "please report this error to https://www.github.com/NVIDIA/TRTorch/issues");
 }

-void AddInputs(
-    ConversionCtx* ctx,
-    at::ArrayRef<const torch::jit::Value*> inputs,
-    std::vector<ir::InputRange>& input_dims) {
+void AddInputs(ConversionCtx* ctx, at::ArrayRef<const torch::jit::Value*> inputs, std::vector<ir::Input>& input_specs) {
   std::vector<const torch::jit::Value*> input_tensors;
   for (auto in : inputs) {
     // Disregarding inputs that are not tensors
@@ -142,29 +139,40 @@ void AddInputs(
     }
   }

+  std::stringstream ss;
+  ss << "Input Dimension Specs: [\n";
+  for (auto i : input_specs) {
+    ss << "    " << i << ",";
+  }
+  ss << ']';
+  LOG_DEBUG(ctx->logger, ss.str());
+
   TRTORCH_CHECK(
-      input_tensors.size() == input_dims.size(),
+      input_tensors.size() == input_specs.size(),
       "Expected dimension specifications for all input tensors"
-          << ", but found " << input_tensors.size() << " input tensors and " << input_dims.size()
+          << ", but found " << input_tensors.size() << " input tensors and " << input_specs.size()
          << " dimension specs (conversion.AddInputs)");

   auto profile = ctx->builder->createOptimizationProfile();

   for (size_t i = 0; i < input_tensors.size(); i++) {
     auto in = input_tensors[i];
-    auto dims = input_dims[i];
+    auto spec = input_specs[i];
     std::string name = std::string("input_") + std::to_string(ctx->num_inputs);
     LOG_INFO(
-        ctx->logger, "Adding Input " << in->debugName() << " named " << name << " in engine (conversion.AddInputs)");
-    LOG_DEBUG(ctx->logger, "Input shape set to " << dims.input_shape);
-    auto trt_in = ctx->net->addInput(name.c_str(), ctx->input_type, dims.input_shape);
+        ctx->logger,
+        "Adding Input " << in->debugName() << " (named: " << name << "): " << spec
+                        << " in engine (conversion.AddInputs)");
+
+    auto trt_in = ctx->net->addInput(name.c_str(), spec.dtype, spec.input_shape);
     TRTORCH_CHECK(trt_in, "Failed to add input node: " << in->debugName() << " (conversion.AddInputs)");
+    trt_in->setAllowedFormats(1U << static_cast<int>(spec.format));

-    profile->setDimensions(trt_in->getName(), nvinfer1::OptProfileSelector::kMIN, dims.min);
-    profile->setDimensions(trt_in->getName(), nvinfer1::OptProfileSelector::kOPT, dims.opt);
-    profile->setDimensions(trt_in->getName(), nvinfer1::OptProfileSelector::kMAX, dims.max);
+    profile->setDimensions(trt_in->getName(), nvinfer1::OptProfileSelector::kMIN, spec.min);
+    profile->setDimensions(trt_in->getName(), nvinfer1::OptProfileSelector::kOPT, spec.opt);
+    profile->setDimensions(trt_in->getName(), nvinfer1::OptProfileSelector::kMAX, spec.max);

-    if (dims.input_is_dynamic) {
+    if (spec.input_is_dynamic) {
       ctx->input_is_dynamic = true;
     }

@@ -178,7 +186,7 @@ void AddInputs(

   ctx->cfg->addOptimizationProfile(profile);
 #if NV_TENSORRT_MAJOR > 7 || (NV_TENSORRT_MAJOR == 7 && NV_TENSORRT_MINOR >= 1)
-  if (ctx->op_precision == nvinfer1::DataType::kINT8) {
+  if (ctx->enabled_precisions.find(nvinfer1::DataType::kINT8) != ctx->enabled_precisions.end()) {
     ctx->cfg->setCalibrationProfile(profile);
   }
 #endif
@@ -350,7 +358,7 @@ void ConvertBlockToNetDef(

   auto inputs = b->inputs();
   AddParamsToCtxValueMap(ctx, static_params);
-  AddInputs(ctx, inputs, build_info.input_ranges);
+  AddInputs(ctx, inputs, build_info.inputs);

   auto nodes = b->nodes();

@@ -428,8 +436,8 @@ std::string ConvertBlockToEngine(const torch::jit::Block* b, ConversionInfo buil
   return engine;
 }

-std::set<std::string> GetUnsupportedOpsInBlock(const torch::jit::Block* b) {
-  std::set<std::string> unsupported_ops;
+std::unordered_map<c10::OperatorName, std::string> GetUnsupportedOpsInBlock(const torch::jit::Block* b) {
+  std::unordered_map<c10::OperatorName, std::string> unsupported_ops;
   for (const auto n : b->nodes()) {
     if (n->kind() != torch::jit::prim::Loop && n->kind() != torch::jit::prim::If && !OpSupported(n)) {
       auto schema = n->maybeSchema();
@@ -438,7 +446,7 @@ std::set<std::string> GetUnsupportedOpsInBlock(const torch::jit::Block* b) {
          "Unable to get schema for Node " << util::node_info(n) << " (conversion.VerifyCoverterSupportForBlock)");
       std::stringstream ss;
       ss << *schema;
-      unsupported_ops.insert(ss.str());
+      unsupported_ops[schema->operator_name()] = ss.str();
     }
     for (const auto sub_b : n->blocks()) {
       auto sub_b_unsupported_ops = GetUnsupportedOpsInBlock(sub_b);
@@ -480,12 +488,27 @@ bool VerifyConverterSupportForBlock(const torch::jit::Block* b) {
     unsupported_msg << "Method requested cannot be compiled by TRTorch.\nUnsupported operators listed below:"
                     << std::endl;
     for (auto s : unsupported_ops) {
-      unsupported_msg << "  - " << s << std::endl;
+      unsupported_msg << "  - " << s.second << std::endl;
     }
     unsupported_msg << "You can either implement converters for these ops in your application or request implementation"
                     << std::endl;
     unsupported_msg << "https://www.github.com/nvidia/TRTorch/issues" << std::endl;
+    unsupported_msg << std::endl << "In Module:" << std::endl;
+
     LOG_ERROR(unsupported_msg.str());
+
+    for (const auto n : b->nodes()) {
+      auto schema = n->maybeSchema();
+      if (schema) {
+        for (const auto& x : unsupported_ops) {
+          if (x.first == schema->operator_name()) {
+            LOG_ERROR(
+                "Unsupported operator: " << *schema << std::endl
+                                         << trtorch::core::util::GetPyTorchSourceCode(n) << std::endl);
+          }
+        }
+      }
+    }
     return false;
   }

core/conversion/conversion.h

Lines changed: 2 additions & 3 deletions
@@ -12,10 +12,9 @@ namespace core {
 namespace conversion {

 struct ConversionInfo {
-  std::vector<ir::InputRange> input_ranges;
+  std::vector<ir::Input> inputs;
   BuilderSettings engine_settings;
-  ConversionInfo(std::vector<ir::InputRange> input_ranges)
-      : input_ranges(std::move(input_ranges)), engine_settings(BuilderSettings()) {}
+  ConversionInfo(std::vector<ir::Input> inputs) : inputs(std::move(inputs)), engine_settings(BuilderSettings()) {}
 };

 // TODO: REMOVE GRAPH AND PARAMS AND MOVE FULLY TO INLINED CONSTANTS
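ConversionInfo now carries the ir::Input specs that the reworked conversion::AddInputs (above) turns into TensorRT network inputs, allowed formats, and a single optimization profile. For reference, a standalone sketch of that plain TensorRT pattern (this is not TRTorch code; the function and tensor names are illustrative):

```cpp
#include "NvInfer.h"

// Sketch: register one dynamic-batch input and an optimization profile,
// mirroring what conversion::AddInputs does for each ir::Input spec.
void AddInputWithProfile(nvinfer1::IBuilder* builder,
                         nvinfer1::INetworkDefinition* net,
                         nvinfer1::IBuilderConfig* cfg) {
  // -1 marks the batch dimension as dynamic at network-definition time.
  auto* in = net->addInput("input_0", nvinfer1::DataType::kHALF, nvinfer1::Dims4{-1, 3, 224, 224});

  auto* profile = builder->createOptimizationProfile();
  profile->setDimensions(in->getName(), nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4{1, 3, 224, 224});
  profile->setDimensions(in->getName(), nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4{8, 3, 224, 224});
  profile->setDimensions(in->getName(), nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4{32, 3, 224, 224});
  cfg->addOptimizationProfile(profile);

  // When INT8 is among the enabled precisions, the same profile also serves calibration (TensorRT >= 7.1):
  // cfg->setCalibrationProfile(profile);
}
```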

core/conversion/conversionctx/ConversionCtx.cpp

Lines changed: 31 additions & 25 deletions
@@ -10,8 +10,11 @@ namespace conversion {
 // clang-format off
 std::ostream& operator<<(std::ostream& os, const BuilderSettings& s) {
   os << "Settings requested for TensorRT engine:" \
-     << "\n    Operating Precision: " << s.op_precision \
-     << "\n    TF32 Floating Point Computation Enabled: " << !s.disable_tf32 \
+     << "\n    Enabled Precisions: ";
+  for (auto p = s.enabled_precisions.begin(); p != s.enabled_precisions.end(); ++p) {
+    os << *p << ' ';
+  }
+  os << "\n    TF32 Floating Point Computation Enabled: " << !s.disable_tf32 \
      << "\n    Truncate Long and Double: " << s.truncate_long_and_double \
      << "\n    Make Refittable Engine: " << s.refit \
      << "\n    Debuggable Engine: " << s.debug \
@@ -57,30 +60,31 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
   LOG_DEBUG(build_settings);
   cfg = builder->createBuilderConfig();

-  switch (settings.op_precision) {
-    case nvinfer1::DataType::kHALF:
-      TRTORCH_CHECK(builder->platformHasFastFp16(), "Requested inference in FP16 but platform does not support FP16");
-      cfg->setFlag(nvinfer1::BuilderFlag::kFP16);
-      input_type = nvinfer1::DataType::kHALF;
-      break;
-    case nvinfer1::DataType::kINT8:
-      TRTORCH_CHECK(builder->platformHasFastInt8(), "Requested inference in INT8 but platform does not support INT8");
-      cfg->setFlag(nvinfer1::BuilderFlag::kINT8);
-      if (!settings.strict_types) {
+  for (auto p = settings.enabled_precisions.begin(); p != settings.enabled_precisions.end(); ++p) {
+    switch (*p) {
+      case nvinfer1::DataType::kHALF:
+        TRTORCH_CHECK(builder->platformHasFastFp16(), "Requested inference in FP16 but platform does not support FP16");
         cfg->setFlag(nvinfer1::BuilderFlag::kFP16);
-      }
-      input_type = nvinfer1::DataType::kFLOAT;
-      TRTORCH_CHECK(
-          settings.calibrator != nullptr,
-          "Requested inference in INT8 but no calibrator provided, set the ptq_calibrator field in the CompileSpec struct with your calibrator");
-      cfg->setInt8Calibrator(settings.calibrator);
-      break;
-    case nvinfer1::DataType::kFLOAT:
-    default:
-      input_type = nvinfer1::DataType::kFLOAT;
-      break;
+        break;
+      case nvinfer1::DataType::kINT8:
+        TRTORCH_CHECK(builder->platformHasFastInt8(), "Requested inference in INT8 but platform does not support INT8");
+        cfg->setFlag(nvinfer1::BuilderFlag::kINT8);
+        TRTORCH_CHECK(
+            settings.calibrator != nullptr,
+            "Requested inference in INT8 but no calibrator provided, set the ptq_calibrator field in the CompileSpec struct with your calibrator");
+        cfg->setInt8Calibrator(settings.calibrator);
+        break;
+      case nvinfer1::DataType::kFLOAT:
+        break;
+      case nvinfer1::DataType::kINT32:
+      case nvinfer1::DataType::kBOOL:
+      default:
+        TRTORCH_THROW_ERROR(
+            "Requested kernel precision that is unsupported: " << *p << " options are float, half, int8");
+    }
   }
-  op_precision = settings.op_precision;
+
+  enabled_precisions = settings.enabled_precisions;

   if (settings.disable_tf32) {
     cfg->clearFlag(nvinfer1::BuilderFlag::kTF32);
@@ -118,7 +122,9 @@ ConversionCtx::ConversionCtx(BuilderSettings build_settings)
         static_cast<int>(settings.device.dla_core) < nbDLACores,
         "Configured DLA Core ID: " << settings.device.dla_core
                                    << " not available. Total number of available DLA Cores: " << nbDLACores);
-    TRTORCH_CHECK(settings.op_precision != nvinfer1::DataType::kFLOAT, "DLA supports only fp16 or int8 precision");
+    TRTORCH_CHECK(
+        settings.enabled_precisions.find(nvinfer1::DataType::kFLOAT) == settings.enabled_precisions.end(),
+        "DLA supports only fp16 or int8 precision");
     cfg->setDLACore(settings.device.dla_core);
   }
 }
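Because enabled_precisions is now a set, several kernel precisions can be requested at once (for example FP32 plus FP16), and each member simply toggles the corresponding TensorRT builder flag. A small standalone sketch of that mapping against the plain TensorRT API (the helper name is illustrative, not TRTorch code):

```cpp
#include <set>
#include "NvInfer.h"

// Sketch: translate a set of enabled precisions into TensorRT builder flags.
// FP32 kernels are always available; FP16 and INT8 each add a flag.
void ApplyPrecisionFlags(const std::set<nvinfer1::DataType>& enabled_precisions, nvinfer1::IBuilderConfig* cfg) {
  if (enabled_precisions.count(nvinfer1::DataType::kHALF)) {
    cfg->setFlag(nvinfer1::BuilderFlag::kFP16);
  }
  if (enabled_precisions.count(nvinfer1::DataType::kINT8)) {
    cfg->setFlag(nvinfer1::BuilderFlag::kINT8); // also requires a calibrator or explicit dynamic ranges
  }
}
```

For example, `ApplyPrecisionFlags({nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kHALF}, cfg)` leaves FP32 kernels available and additionally allows FP16 kernels, which is what `enabled_precisions = {torch::kHalf}` ultimately requests.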

core/conversion/conversionctx/ConversionCtx.h

Lines changed: 4 additions & 3 deletions
@@ -2,6 +2,7 @@

 #include <map>
 #include <memory>
+#include <set>
 #include <unordered_map>

 #include "NvInfer.h"
@@ -23,7 +24,8 @@ struct Device {
 };

 struct BuilderSettings {
-  nvinfer1::DataType op_precision = nvinfer1::DataType::kFLOAT;
+  std::set<nvinfer1::DataType> enabled_precisions = {nvinfer1::DataType::kFLOAT};
+  std::vector<nvinfer1::DataType> input_dtypes;
   bool disable_tf32 = false;
   bool refit = false;
   bool debug = false;
@@ -57,8 +59,7 @@ struct ConversionCtx {
   nvinfer1::IBuilder* builder;
   nvinfer1::INetworkDefinition* net;
   nvinfer1::IBuilderConfig* cfg;
-  nvinfer1::DataType input_type;
-  nvinfer1::DataType op_precision;
+  std::set<nvinfer1::DataType> enabled_precisions;
   BuilderSettings settings;
   util::logging::TRTorchLogger logger;
   // Pointers to data that needs to remain alive until conversion is done

core/conversion/converters/impl/activation.cpp

Lines changed: 3 additions & 1 deletion
@@ -177,7 +177,9 @@ auto acthardtanh TRTORCH_UNUSED =
            std::string pluginName = "CustomGeluPluginDynamic";
            nvinfer1::PluginFieldCollection fc;
            std::vector<nvinfer1::PluginField> f;
-           int type_id = ctx->settings.op_precision == nvinfer1::DataType::kFLOAT
+           // REVIEW is this right?
+           int type_id = ctx->settings.enabled_precisions.find(nvinfer1::DataType::kHALF) ==
+                   ctx->settings.enabled_precisions.end()
                ? 0
                : 1; // Integer encoding the DataType (0: FP32, 1: FP16)
            f.emplace_back(nvinfer1::PluginField("type_id", &type_id, nvinfer1::PluginFieldType::kINT32, 1));
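The GELU converter now derives the plugin's `type_id` field from the enabled-precision set (0 for FP32, 1 for FP16). A hedged sketch of how such a field feeds the TensorRT plugin registry (the creator version "1", the default plugin namespace, and prior plugin-library registration, e.g. via initLibNvInferPlugins, are assumptions):

```cpp
#include <set>
#include <vector>
#include "NvInfer.h"

// Sketch: pick the GELU plugin's type_id from the enabled precisions and create the plugin.
nvinfer1::IPluginV2* MakeGeluPlugin(const std::set<nvinfer1::DataType>& enabled_precisions) {
  int type_id = enabled_precisions.count(nvinfer1::DataType::kHALF) ? 1 : 0; // 0: FP32, 1: FP16

  std::vector<nvinfer1::PluginField> fields;
  fields.emplace_back(nvinfer1::PluginField("type_id", &type_id, nvinfer1::PluginFieldType::kINT32, 1));

  nvinfer1::PluginFieldCollection fc;
  fc.nbFields = static_cast<int>(fields.size());
  fc.fields = fields.data();

  auto* creator = getPluginRegistry()->getPluginCreator("CustomGeluPluginDynamic", "1");
  return creator ? creator->createPlugin("gelu", &fc) : nullptr;
}
```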
