Skip to content

Commit 52152d0

Browse files
gs-olivebowang007
authored and committed
fix: Bugfix in shape analysis for multi-GPU systems
- Shape analysis code in partitioning defaults dry-run tensors to cuda:0 despite user-specified devices - This leads to errors about device casting for internal tensors, which users cannot cast - Add GPU-ID function arguments in functions to generate new tensors on the user-specified (or default) device
1 parent bd27e9d commit 52152d0

File tree

3 files changed

+27
-11
lines changed

3 files changed

+27
-11
lines changed

core/partitioning/partitioning.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -542,14 +542,26 @@ bool isInputDynamic(PartitioningCtx* ctx) {
542542
void populateInputIValues(PartitioningCtx* ctx) {
543543
if (isInputDynamic(ctx)) {
544544
ctx->min_input_ivalues_map = partitioning::generateRandomInputs(
545-
ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kMIN);
545+
ctx->settings.collection_input_spec_map,
546+
ctx->input_types_map,
547+
ir::ShapeMode::kMIN,
548+
ctx->settings.target_device.gpu_id);
546549
ctx->opt_input_ivalues_map = partitioning::generateRandomInputs(
547-
ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kOPT);
550+
ctx->settings.collection_input_spec_map,
551+
ctx->input_types_map,
552+
ir::ShapeMode::kOPT,
553+
ctx->settings.target_device.gpu_id);
548554
ctx->max_input_ivalues_map = partitioning::generateRandomInputs(
549-
ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kMAX);
555+
ctx->settings.collection_input_spec_map,
556+
ctx->input_types_map,
557+
ir::ShapeMode::kMAX,
558+
ctx->settings.target_device.gpu_id);
550559
} else {
551560
ctx->opt_input_ivalues_map = partitioning::generateRandomInputs(
552-
ctx->settings.collection_input_spec_map, ctx->input_types_map, ir::ShapeMode::kOPT);
561+
ctx->settings.collection_input_spec_map,
562+
ctx->input_types_map,
563+
ir::ShapeMode::kOPT,
564+
ctx->settings.target_device.gpu_id);
553565
}
554566
}
555567

core/partitioning/partitioning.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,8 @@ const std::unordered_set<c10::Symbol> CollectionNodeKinds = {
3434
ExampleIValues generateRandomInputs(
3535
ir::CollectionInputSpecMap& input_ranges,
3636
ir::CollectionTypeMap& input_types,
37-
const ir::ShapeMode& shape_mode = ir::ShapeMode::kOPT);
37+
const ir::ShapeMode& shape_mode = ir::ShapeMode::kOPT,
38+
int64_t gpu_id = 0);
3839

3940
void populateInputIValues(PartitioningCtx* ctx);
4041

core/partitioning/shape_analysis.cpp

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ namespace partitioning {
1313
at::Tensor generateSingleInput(
1414
ir::Input& input,
1515
c10::optional<at::ScalarType>& type_opt,
16-
const ir::ShapeMode& shape_mode) {
16+
const ir::ShapeMode& shape_mode,
17+
int64_t gpu_id) {
1718
nvinfer1::Dims input_shape = input.input_shape;
1819
if (input.input_is_dynamic) {
1920
if (shape_mode == ir::ShapeMode::kMIN) {
@@ -42,15 +43,17 @@ at::Tensor generateSingleInput(
4243

4344
// Make the value range for input tensor a uniform (float) distribution
4445
// over [LoValIncl, HiValExcl), then cast to the desired dtype
45-
auto in = ((HiValExcl - LoValIncl) * at::rand(util::toVec(input_shape), {at::kCUDA}) + LoValIncl).to(type);
46+
auto in = ((HiValExcl - LoValIncl) * at::rand(util::toVec(input_shape)) + LoValIncl)
47+
.to(at::Device(at::kCUDA, gpu_id), type);
4648

4749
return in;
4850
}
4951

5052
std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomInputs(
5153
std::unordered_map<const torch::jit::Value*, std::vector<ir::Input>>& inputs,
5254
std::unordered_map<const torch::jit::Value*, std::vector<c10::optional<at::ScalarType>>>& types,
53-
const ir::ShapeMode& shape_mode) {
55+
const ir::ShapeMode& shape_mode,
56+
int64_t gpu_id) {
5457
// generate random inputs for running pytorch segments
5558
std::unordered_map<const torch::jit::Value*, torch::jit::IValue> ivalue_map;
5659

@@ -59,21 +62,21 @@ std::unordered_map<const torch::jit::Value*, torch::jit::IValue> generateRandomI
5962
c10::TypePtr elementType = c10::TensorType::get();
6063
auto generic_list = c10::impl::GenericList(elementType);
6164
for (size_t i = 0; i < input.second.size(); i++) {
62-
auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode);
65+
auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode, gpu_id);
6366
generic_list.push_back(in.clone());
6467
}
6568
ivalue_map[input.first] = c10::IValue(generic_list);
6669
} else if (input.first->type()->kind() == torch::jit::TypeKind::TupleType) {
6770
// create tuple
6871
std::vector<torch::jit::IValue> list;
6972
for (size_t i = 0; i < input.second.size(); i++) {
70-
auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode);
73+
auto in = generateSingleInput(input.second[i], types[input.first][i], shape_mode, gpu_id);
7174
list.push_back(in.clone());
7275
}
7376
auto tuple = c10::ivalue::Tuple::create(list); // create tuple ptr
7477
ivalue_map[input.first] = c10::IValue(tuple);
7578
} else {
76-
auto in = generateSingleInput(input.second[0], types[input.first][0], shape_mode);
79+
auto in = generateSingleInput(input.second[0], types[input.first][0], shape_mode, gpu_id);
7780
ivalue_map[input.first] = in.clone();
7881
}
7982
}

0 commit comments

Comments
 (0)