Skip to content

Commit b24cc53

Browse files
Cherry pick (1) linear + tanh fusion (2) fix dim size 1 issue (3) fix shuffle2d (#711)
* Add linear+tanh fusion for inference (#685) * init linear tanh fusion * add ut * Check tensor stride if it contains size 1 before passing to OneDNN (#689) * init fix * add condition for channelslast contiguous * add comments and refine the code * rebase linear pattern * Fix shufflenet reg with dynamic shape context shuffle2d pattern (#724) * fix shufflenet reg with dynamic shape context pattern * refine code * refine filter code and add no match ut Co-authored-by: chunyuan-w <[email protected]>
1 parent d2cce99 commit b24cc53

File tree

9 files changed

+275
-17
lines changed

9 files changed

+275
-17
lines changed

intel_extension_for_pytorch/csrc/cpu/ideep/IDeepConversions.cpp

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,53 @@ using IDeepTensorWrapperPtr = c10::intrusive_ptr<IDeepTensorWrapper>;
3838
using MKLDNNTensorImpl = at::OpaqueTensorImpl<IDeepTensorWrapperPtr>;
3939
using MKLDNNTensor = at::Tensor;
4040

41+
dnnl::memory::dims get_stride_with_size_1_fix(const at::Tensor& tensor) {
  // Returns the tensor's strides, rewriting the stride entries of size-1
  // dims so they follow a strict contiguous (or channels-last contiguous)
  // layout. PyTorch leaves strides of size-1 dims unconstrained (e.g. a
  // [1, 768] view can carry stride [1536, 1]), which can push oneDNN onto
  // its slow reference path; see the note at the declaration in
  // IDeepConversions.h.
  auto fixed_strides = tensor.strides().vec();
  const auto ndim = tensor.dim();

  // Decide whether a fix is needed: only when some dim has size 1 and the
  // tensor is contiguous in either the default or a channels-last sense.
  // Only the first size-1 dim determines the decision, matching the break
  // semantics of the original scan.
  bool needs_fix = false;
  bool channels_last = false;
  for (int d = 0; d < ndim; d++) {
    if (tensor.size(d) != 1) {
      continue;
    }
    if (tensor.is_contiguous()) {
      needs_fix = true;
    } else if (
        tensor.is_contiguous(at::MemoryFormat::ChannelsLast) ||
        tensor.is_contiguous(at::MemoryFormat::ChannelsLast3d)) {
      channels_last = true;
      needs_fix = true;
    }
    break;
  }

  if (!needs_fix) {
    return fixed_strides;
  }

  // The unit-stride dim is the last dim for the default contiguous layout,
  // and the channel dim (idx 1) for channels-last layouts.
  const int contiguous_idx = channels_last ? 1 : static_cast<int>(ndim) - 1;
  fixed_strides[contiguous_idx] = 1;

  // Recompute strides of the remaining size-1 dims from last to first, so
  // each one can be derived from a stride that is already correct.
  for (int d = static_cast<int>(ndim) - 1; d >= 0; d--) {
    if (tensor.size(d) != 1 || d == contiguous_idx) {
      continue;
    }
    if (channels_last && d == ndim - 1) {
      // Last spatial dim under channels-last: its neighbor in memory is
      // the channel dim.
      fixed_strides[d] =
          tensor.size(contiguous_idx) * fixed_strides[contiguous_idx];
    } else if (channels_last && d == 0) {
      // Batch dim under channels-last: derived from the first spatial dim.
      fixed_strides[d] = tensor.size(2) * fixed_strides[2];
    } else {
      // Default contiguous relation: stride(d) = size(d+1) * stride(d+1),
      // computed in last-to-first order.
      fixed_strides[d] = tensor.size(d + 1) * fixed_strides[d + 1];
    }
  }
  return fixed_strides;
}
87+
4188
ideep::tensor::data_type get_mkldnn_dtype(at::ScalarType type) {
4289
switch (type) {
4390
case at::ScalarType::Float:
@@ -76,10 +123,11 @@ ideep::tensor itensor_view_from_dense(const at::Tensor& tensor) {
76123
tensor.scalar_type() == at::ScalarType::Float ||
77124
tensor.scalar_type() == at::ScalarType::BFloat16,
78125
"itensor_view_from_dense expects float tensor input");
126+
79127
return {
80128
{tensor.sizes().vec(),
81129
get_mkldnn_dtype(tensor.scalar_type()),
82-
tensor.strides().vec()},
130+
get_stride_with_size_1_fix(tensor)},
83131
tensor.data_ptr()};
84132
}
85133

intel_extension_for_pytorch/csrc/cpu/ideep/IDeepConversions.h

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,5 +38,22 @@ at::Tensor empty_aten_tensor_from_desc(
3838
const ideep::tensor::desc& desc,
3939
const at::TensorOptions& options);
4040

41+
// ##Background##
42+
// This function returns the input tensor's stride with a workaround that checks
43+
// (and fixes) the stride when the input tensor has dim size 1. Currently oneDNN
44+
// does not expect the behavior that, with dim size 1, a PyTorch tensor's stride
45+
// is meaningless and may not follow a strict contiguous layout, which may make
46+
// oneDNN go into ref path (perf drop). For example: A tensor with shape [1,
47+
// 768] and stride [1536, 1] is not expected by current oneDNN, though PyTorch
48+
// will think it is contiguous since dim0 is size 1. Such a Tensor can be
49+
// constructed by slice [:,0,:] from another tensor with shape [1, 2, 768] and
50+
// stride [1536, 768, 1], and it is a real case in Albert model pooler layer.
51+
// ##Performance Impact##
52+
// It takes ~0.05us on average for calling this function when creating a mkldnn
53+
// tensor.
54+
// ##TODO##
55+
// Will remove this workaround after oneDNN's fix.
56+
dnnl::memory::dims get_stride_with_size_1_fix(const at::Tensor& tensor);
57+
4158
} // namespace cpu
4259
} // namespace torch_ipex

intel_extension_for_pytorch/csrc/cpu/ideep/ideep/attributes.hpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,17 @@ struct attr_t : public dnnl::primitive_attr {
101101
return attr;
102102
}
103103

104+
static attr_t fuse_tanh(
    float scale = 1.0,
    float alpha = 0.f,
    float beta = 0.f) {
  // Build a primitive attribute whose post-op applies an element-wise
  // tanh to the primitive output (parallel to fuse_relu / fuse_elu).
  attr_t result;
  post_ops ops;
  ops.append_eltwise(scale, algorithm::eltwise_tanh, alpha, beta);
  result.set_post_ops(ops);
  return result;
}
114+
104115
static attr_t fuse_elu(
105116
float scale = 1.0,
106117
float alpha = 0.f,

intel_extension_for_pytorch/csrc/jit/cpu/kernels/LinearPacked.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,15 @@ at::Tensor linear_gelu_run(
6969
input, ideep::attr_t::fuse_gelu(1.0, 0.f, 0.f, gelu_type));
7070
}
7171

72+
at::Tensor linear_tanh_run(
    const at::Tensor& input,
    const c10::intrusive_ptr<LinearOpContext>& op_context) {
  // Runs the prepacked linear kernel with a fused tanh post-op attribute.
  IPEX_RECORD_FUNCTION(
      "ipex_prepack::linear_tanh_run", std::vector<c10::IValue>({}));

  const auto tanh_attr = ideep::attr_t::fuse_tanh();
  return op_context->run(input, tanh_attr);
}
80+
7281
at::Tensor linear_sigmoid_run(
7382
const at::Tensor& input,
7483
const c10::intrusive_ptr<LinearOpContext>& op_context) {

intel_extension_for_pytorch/csrc/jit/cpu/kernels/LinearPacked.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,10 @@ at::Tensor linear_gelu_run(
2929
const c10::intrusive_ptr<LinearOpContext>& op_context,
3030
c10::string_view approximate);
3131

32+
at::Tensor linear_tanh_run(
33+
const at::Tensor& input,
34+
const c10::intrusive_ptr<LinearOpContext>& op_context);
35+
3236
at::Tensor linear_sigmoid_run(
3337
const at::Tensor& input,
3438
const c10::intrusive_ptr<LinearOpContext>& op_context);

intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite.cpp

Lines changed: 76 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,25 +24,55 @@ c10::optional<IValue> getIValue(
2424
return toIValue(getValue(name, match_vmap, vmap));
2525
}
2626

27+
// FuseShuffle is matching the channelshuffle pattern, where:
28+
// (1) the first view is [n, c, h, w] => [n, groups, c // groups, h, w]
29+
// (2) the transpose is for groups => [n, c // groups, groups, h, w]
30+
// (3) the output view shape should be the same as the input tensor shape
2731
void FuseShuffle(std::shared_ptr<Graph>& graph) {
28-
std::string shuffle = R"(
32+
// below is channelshuffle for static view shape pattern
33+
std::string channelshuffle_with_static_shape = R"(
2934
graph(%input, %view_shape:int[], %trans_dim0:int, %trans_dim1:int, %mem_format:int, %flattern_shape:int[]):
3035
%r1 = aten::view(%input, %view_shape)
3136
%r2 = aten::transpose(%r1, %trans_dim0, %trans_dim1)
3237
%r3 = aten::contiguous(%r2, %mem_format)
3338
%r4 = aten::view(%r3, %flattern_shape)
3439
return (%r4) )";
3540

36-
std::string shuffle_2d_fusion = R"(
41+
std::string shuffle_2d_fusion_with_static_shape = R"(
3742
graph(%input, %view_shape:int[], %trans_dim0:int, %trans_dim1:int, %mem_format:int, %flattern_shape:int[]):
3843
%r = ipex::shuffle_2d(%input, %view_shape, %trans_dim0, %trans_dim1)
3944
return (%r) )";
4045

41-
// this filter passes only for the following conditions:
42-
// (1) the first view is [n, c, h, w] => [n, groups, c // groups, h, w]
43-
// (2) the tranpose is for groups => [n, c // groups, grpups, h, w]
44-
// (3) the output view shape should be the same as the input tensor shape
45-
auto filter_shuffle_2d_fusion =
46+
// below is channelshuffle for dynamic view shape pattern
47+
std::string dynamic_shape_input = R"(
48+
graph(%input, %idx_0:int, %idx_1:int, %idx_2:int, %idx_3:int, %div_g, %g:int, %type, %flattern_c):
49+
%n_ = aten::size(%input, %idx_0)
50+
%c_ = aten::size(%input, %idx_1)
51+
%tensor_c_ = prim::NumToTensor(%c_)
52+
%h_ = aten::size(%input, %idx_2)
53+
%w_ = aten::size(%input, %idx_3)
54+
%c_div_g_ = aten::div(%tensor_c_, %div_g, %type)
55+
%int_c_div_g_ = aten::Int(%c_div_g_)
56+
%view_shape:int[] = prim::ListConstruct(%n_, %g, %int_c_div_g_, %h_, %w_) )";
57+
58+
std::string channelshuffle_for_dynamic_shape = R"(
59+
%r1 = aten::view(%input, %view_shape)
60+
%r2 = aten::transpose(%r1, %idx_1, %idx_2)
61+
%r3 = aten::contiguous(%r2, %idx_0)
62+
%flattern_shape:int[] = prim::ListConstruct(%n_, %flattern_c, %h_, %w_)
63+
%r4 = aten::view(%r3, %flattern_shape)
64+
return (%r4) )";
65+
66+
std::string shuffle_2d_fusion_for_dynamic_shape = R"(
67+
%r = ipex::shuffle_2d(%input, %view_shape, %idx_1, %idx_2)
68+
return (%r) )";
69+
70+
std::string channelshuffle_with_dynamic_shape =
71+
dynamic_shape_input + channelshuffle_for_dynamic_shape;
72+
std::string shuffle_2d_fusion_with_dynamic_shape =
73+
dynamic_shape_input + shuffle_2d_fusion_for_dynamic_shape;
74+
75+
auto filter_shuffle_2d_static_fusion =
4676
[](const Match& match,
4777
const std::unordered_map<std::string, Value*>& vmap) {
4878
const auto& match_vmap = match.values_map;
@@ -86,11 +116,12 @@ void FuseShuffle(std::shared_ptr<Graph>& graph) {
86116
return false;
87117
}
88118

89-
// if the view shape and flattern shape is not set
119+
// if the view shape or flattern shape is not set
90120
if (!toIValue(view_shape_).has_value() ||
91121
!toIValue(flattern_shape_).has_value()) {
92122
return false;
93123
}
124+
94125
auto view_shape_list = toIValue(view_shape_).value().toIntVector();
95126
auto flattern_shape_list =
96127
toIValue(flattern_shape_).value().toIntVector();
@@ -134,10 +165,43 @@ void FuseShuffle(std::shared_ptr<Graph>& graph) {
134165
return true;
135166
};
136167

137-
SubgraphRewriter rewriter_shuffle_2d;
138-
rewriter_shuffle_2d.RegisterRewritePattern(shuffle, shuffle_2d_fusion);
139-
rewriter_shuffle_2d.runOnGraph(graph, filter_shuffle_2d_fusion);
140-
}
168+
auto filter_shuffle_2d_dynamic_fusion =
169+
[](const Match& match,
170+
const std::unordered_map<std::string, Value*>& vmap) {
171+
const auto& match_vmap = match.values_map;
172+
173+
auto n_idx = getIValue("idx_0", match_vmap, vmap);
174+
auto c_idx = getIValue("idx_1", match_vmap, vmap);
175+
auto h_idx = getIValue("idx_2", match_vmap, vmap);
176+
auto w_idx = getIValue("idx_3", match_vmap, vmap);
177+
if (!n_idx.has_value() || !c_idx.has_value() || !h_idx.has_value() ||
178+
!w_idx.has_value()) {
179+
return false;
180+
}
181+
182+
auto n_idx_ = n_idx.value().toInt();
183+
auto c_idx_ = c_idx.value().toInt();
184+
auto h_idx_ = h_idx.value().toInt();
185+
auto w_idx_ = w_idx.value().toInt();
186+
187+
if ((n_idx_ != 0) || (c_idx_ != 1) || (h_idx_ != 2) || (w_idx_ != 3)) {
188+
return false;
189+
}
190+
191+
return true;
192+
};
193+
194+
SubgraphRewriter rewriter_shuffle_2d_dynamic;
195+
rewriter_shuffle_2d_dynamic.RegisterRewritePattern(
196+
channelshuffle_with_dynamic_shape, shuffle_2d_fusion_with_dynamic_shape);
197+
rewriter_shuffle_2d_dynamic.runOnGraph(
198+
graph, filter_shuffle_2d_dynamic_fusion);
199+
SubgraphRewriter rewriter_shuffle_2d_static;
200+
rewriter_shuffle_2d_static.RegisterRewritePattern(
201+
channelshuffle_with_static_shape, shuffle_2d_fusion_with_static_shape);
202+
rewriter_shuffle_2d_static.runOnGraph(graph, filter_shuffle_2d_static_fusion);
203+
204+
} // FuseShuffle
141205

142206
void FuseAddLayerNorm(std::shared_ptr<Graph>& graph) {
143207
std::string aten_add_layernorm = R"(

intel_extension_for_pytorch/csrc/jit/cpu/passes/graph_rewrite_linear.cpp

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,11 +125,12 @@ void insertPrePackedLinearOp(std::shared_ptr<Graph>& graph) {
125125

126126
void fuseLinearWithEltwise(std::shared_ptr<Graph>& graph) {
127127
SubgraphRewriter rewriter_relu, rewriter_gelu, rewriter_silu,
128-
rewriter_sigmoid, rewriter_swish;
128+
rewriter_sigmoid, rewriter_swish, rewriter_tanh;
129129
std::array<std::string, 2> relu_operators = {"relu", "relu_"};
130130
std::array<std::string, 2> sigmoid_operators = {"sigmoid", "sigmoid_"};
131131
std::array<std::string, 2> silu_operators = {"silu", "silu_"};
132132
std::array<std::string, 2> mul_operators = {"mul", "mul_"};
133+
std::array<std::string, 2> tanh_operators = {"tanh", "tanh_"};
133134

134135
auto linear_relu_rstring = CodeTemplate(R"(
135136
graph(%input, %packed_weight):
@@ -142,6 +143,17 @@ void fuseLinearWithEltwise(std::shared_ptr<Graph>& graph) {
142143
%res = ipex_prepack::linear_relu_run(%input, %packed_weight)
143144
return (%res))";
144145

146+
auto linear_tanh_rstring = CodeTemplate(R"(
147+
graph(%input, %packed_weight):
148+
%x = ipex_prepack::linear_run(%input, %packed_weight)
149+
%res = aten::${tanh}(%x)
150+
return (%res))");
151+
152+
std::string linear_tanh_fused = R"(
153+
graph(%input, %packed_weight):
154+
%res = ipex_prepack::linear_tanh_run(%input, %packed_weight)
155+
return (%res))";
156+
145157
std::string linear_gelu = R"(
146158
graph(%input, %approximate, %packed_weight):
147159
%x = ipex_prepack::linear_run(%input, %packed_weight)
@@ -189,6 +201,13 @@ void fuseLinearWithEltwise(std::shared_ptr<Graph>& graph) {
189201
linear_relu_rstring.format(env), linear_relu_fused);
190202
}
191203

204+
for (const auto& tanh : tanh_operators) {
205+
TemplateEnv env;
206+
env.s("tanh", tanh);
207+
rewriter_tanh.RegisterRewritePattern(
208+
linear_tanh_rstring.format(env), linear_tanh_fused);
209+
}
210+
192211
for (const auto& silu : silu_operators) {
193212
TemplateEnv env;
194213
env.s("silu", silu);
@@ -213,6 +232,7 @@ void fuseLinearWithEltwise(std::shared_ptr<Graph>& graph) {
213232
rewriter_gelu.RegisterRewritePattern(linear_gelu, linear_gelu_fused);
214233

215234
rewriter_relu.runOnGraph(graph);
235+
rewriter_tanh.runOnGraph(graph);
216236
rewriter_gelu.runOnGraph(graph);
217237
}
218238

intel_extension_for_pytorch/csrc/jit/cpu/passes/register_dnnl_jit_ops.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,24 @@ RegisterOperators op({
456456
};
457457
},
458458
aliasAnalysisFromSchema()),
459+
460+
Operator(
461+
"ipex_prepack::linear_tanh_run(Tensor input, "
462+
"__torch__.torch.classes.ipex_prepack.LinearOpContext W_prepack) "
463+
"-> Tensor",
464+
[](const Node* node) -> Operation {
465+
return [](Stack* stack) {
466+
auto result = linear_tanh_run(
467+
(std::move(peek(stack, 0, 2))).toTensor(),
468+
(std::move(peek(stack, 1, 2)))
469+
.toCustomClass<LinearOpContext>());
470+
drop(stack, 2);
471+
pack(stack, std::move(result));
472+
return 0;
473+
};
474+
},
475+
aliasAnalysisFromSchema()),
476+
459477
Operator(
460478
"ipex_prepack::linear_sigmoid_run(Tensor input, "
461479
"__torch__.torch.classes.ipex_prepack.LinearOpContext W_prepack) "

0 commit comments

Comments
 (0)