Skip to content

Commit 150b051

Browse files
SS-JIA authored and facebook-github-bot committed
Update and fix stack
Summary: Clean up implementation of `aten::stack_out`, and allow it to handle input tensor list with different dtypes. Use [ATen stack impl](https://fburl.com/code/9phz8y5w) as a reference. Reviewed By: manuelcandales Differential Revision: D47556485 fbshipit-source-id: 85784ac142672e31fa355cf5a64767c2b4c16c98
1 parent 334d4aa commit 150b051

File tree

5 files changed

+128
-112
lines changed

5 files changed

+128
-112
lines changed

kernels/portable/cpu/op_stack.cpp

Lines changed: 34 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
#include <cstring>
44

5+
#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
56
#include <executorch/runtime/kernel/kernel_includes.h>
67

78
namespace torch {
@@ -10,127 +11,48 @@ namespace native {
1011

1112
using Tensor = exec_aten::Tensor;
1213

13-
namespace {
14-
15-
// TODO(gasoonjia): Move this to a common spot so all implementation of
16-
// this operator can share it. (e.g., DSP-specific)
17-
/// Asserts that the parameters are valid.
18-
void check_stack_out_args(
19-
exec_aten::ArrayRef<Tensor> tensors,
20-
int64_t dim,
21-
Tensor& out) {
22-
// Stack expects non-empty tensor list
23-
ET_CHECK_MSG(tensors.size() > 0, "Stack expects non-empty tensor list");
24-
25-
// Ensure dim is in range. Use `out` as a proxy for all input tensors, since
26-
// they will all need to have the same number of dimensions besides the dim
27-
// one.
28-
ET_CHECK_MSG(
29-
dim >= 0 && dim < out.dim(),
30-
"dim %" PRId64 " out of range [0,%zd)",
31-
dim,
32-
out.dim());
33-
34-
for (size_t i = 0; i < tensors.size(); i++) {
35-
// All input dtypes must match the output dtype.
36-
ET_CHECK_MSG(
37-
tensors[i].scalar_type() == out.scalar_type(),
38-
"tensors[%zu] dtype %hhd != out dtype %hhd",
39-
i,
40-
tensors[i].scalar_type(),
41-
out.scalar_type());
42-
43-
// All input tensors need to be of the same size
44-
// Also, since we create a new axis in output for stacking, the output.dim()
45-
// should be one larger than input.dim()
46-
// https://pytorch.org/docs/stable/generated/torch.stack.html
47-
ET_CHECK_MSG(
48-
tensors[i].dim() == out.dim() - 1,
49-
"tensors[%zu].dim() %zd != out.dim() - 1 %zd",
50-
i,
51-
tensors[i].dim(),
52-
out.dim() - 1);
53-
54-
// The size of each input tensor should be the same. Here we use `out` as
55-
// proxy for comparsion. Also, the size of output tensor should follow these
56-
// rules:
57-
// - For any input tensor, its size(i) == output.size(i) if i < dim, and its
58-
// size(i) == output.size(i+1) if i >= dim
59-
// - For the cat dimension (output[dim]), its size should be the number of
60-
// input tensors
61-
for (size_t d = 0; d < tensors[i].dim(); d++) {
62-
if (d < dim) {
63-
ET_CHECK_MSG(
64-
tensors[i].size(d) == out.size(d),
65-
"tensors[%zu].size(%zu) %zd != out.size(%zu) %zd | dim = %" PRId64,
66-
i,
67-
d,
68-
tensors[i].size(d),
69-
d,
70-
out.size(d),
71-
dim);
72-
} else {
73-
ET_CHECK_MSG(
74-
tensors[i].size(d) == out.size(d + 1),
75-
"tensors[%zu].size(%zu) %zd != out.size(%zu) %zd | dim = %" PRId64,
76-
i,
77-
d,
78-
tensors[i].size(d),
79-
d + 1,
80-
out.size(d + 1),
81-
dim);
82-
}
83-
}
84-
}
85-
86-
// The size of the stack dimension of the output should be the number of
87-
// input tensors
88-
ET_CHECK_MSG(
89-
out.size(dim) == tensors.size(),
90-
"out.size(%" PRId64 ") %zd != number of input tensors %zu",
91-
dim,
92-
out.size(dim),
93-
tensors.size());
94-
}
95-
} // namespace
96-
97-
/// stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
9814
/// stack.out(Tensor[] tensors, int dim=0, *, Tensor(a!) out) -> Tensor(a!)
///
/// Stacks the input tensors (which must all share one shape) along a new
/// axis inserted at `dim`, casting each element to the dtype of `out`.
/// Follows the ATen stack implementation as the reference behavior.
///
/// @param ctx Runtime context (unused).
/// @param tensors Non-empty list of same-shaped input tensors.
/// @param dim Insertion position; python-style negative values allowed.
/// @param out Output tensor; resized here to the stacked shape.
/// @return `out`, for chaining.
Tensor& stack_out(
    RuntimeContext& ctx,
    exec_aten::ArrayRef<Tensor> tensors,
    int64_t dim,
    Tensor& out) {
  (void)ctx;

  // Support python-style negative indexing. Wrap against the rank the output
  // will have after stacking (input rank + 1) rather than out.dim(): `out` is
  // only resized below, so its current rank may not yet be the final one.
  // Guarded on a non-empty list; an empty list is rejected by
  // check_stack_args regardless of `dim`.
  if (dim < 0 && tensors.size() > 0) {
    dim += tensors[0].dim() + 1;
  }

  check_stack_args(tensors, dim, out);

  // Resize `out` to the target shape: the common input shape with an extra
  // axis of length tensors.size() inserted at `dim`.
  Tensor::SizesType expected_out_size[kTensorDimensionLimit];
  size_t expected_out_dim = 0;
  get_stack_out_target_size(tensors, dim, expected_out_size, &expected_out_dim);
  ET_CHECK(
      resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok);

  // Copy layout: for each of `outer` leading slices, emit one `inner`-sized
  // contiguous chunk from every input tensor, in input order.
  const size_t outer = getLeadingDims(out, dim);
  const size_t inner = getTrailingDims(out, dim);
  const size_t ninputs = tensors.size();

  const auto out_type = out.scalar_type();
  ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, "stack", CTYPE_OUT, [&] {
    CTYPE_OUT* out_ptr = out.mutable_data_ptr<CTYPE_OUT>();
    for (size_t i = 0; i < outer; ++i) {
      for (size_t j = 0; j < ninputs; ++j) {
        // Inputs may each have a different dtype (validated castable by
        // check_stack_args); every element is converted to the output dtype.
        const auto in_type = tensors[j].scalar_type();
        ET_SWITCH_REAL_TYPES_AND(Bool, in_type, ctx, "stack", CTYPE_IN, [&] {
          const CTYPE_IN* const in_ptr =
              tensors[j].const_data_ptr<CTYPE_IN>() + i * inner;

          for (size_t k = 0; k < inner; ++k) {
            out_ptr[k] = static_cast<CTYPE_OUT>(in_ptr[k]);
          }
          out_ptr += inner;
        });
      }
    }
  });

  return out;
}
13658

kernels/portable/cpu/targets.bzl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -652,6 +652,9 @@ _ATEN_OPS = (
652652
),
653653
op_target(
654654
name = "op_stack",
655+
deps = [
656+
"//executorch/kernels/portable/cpu/util:copy_ops_util",
657+
],
655658
),
656659
op_target(
657660
name = "op_sub",
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// Copyright (c) Meta Platforms, Inc. and affiliates.
2+
3+
#include <cstring>
4+
5+
#include <executorch/runtime/core/exec_aten/exec_aten.h>
6+
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
7+
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
8+
#include <executorch/runtime/platform/assert.h>
9+
10+
namespace torch {
11+
namespace executor {
12+
13+
using Tensor = exec_aten::Tensor;
14+
15+
/// Asserts that the arguments to stack.out are valid:
/// - `tensors` is non-empty,
/// - every input dtype can be cast to the output dtype,
/// - all inputs share a single shape, and
/// - `dim` (already normalized to be non-negative by the caller) is a valid
///   insertion position in the output, i.e. in [0, input_rank + 1).
///
/// @param tensors List of input tensors to be stacked.
/// @param dim Non-negative insertion position.
/// @param out Output tensor, used only for its dtype here.
void check_stack_args(
    exec_aten::ArrayRef<Tensor> tensors,
    int64_t dim,
    Tensor& out) {
  // Ensure the input tensors list is non-empty
  ET_CHECK(tensors.size() > 0);

  // All input tensors need to be of the same size
  // https://pytorch.org/docs/stable/generated/torch.stack.html
  for (size_t i = 0; i < tensors.size(); i++) {
    // All input dtypes must be castable to the output dtype.
    ET_CHECK(canCast(tensors[i].scalar_type(), out.scalar_type()));

    ET_CHECK(tensors[i].dim() == tensors[0].dim());
    // Cast the (signed) rank once to avoid signed/unsigned comparison in the
    // loop condition; rank is non-negative so the cast is safe.
    for (size_t d = 0; d < static_cast<size_t>(tensors[i].dim()); d++) {
      ET_CHECK(tensors[i].size(d) == tensors[0].size(d));
    }
  }

  // The output tensor will have a dimension inserted, so dim should be
  // between 0 and ndim_of_inputs + 1
  ET_CHECK(dim >= 0 && dim < tensors[0].dim() + 1);
}
38+
39+
/// Computes the shape `out` must have for stack.out: the common input shape
/// with an axis of length tensors.size() inserted at position `dim`.
///
/// @param tensors Non-empty list of same-shaped inputs (pre-validated by
///     check_stack_args).
/// @param dim Non-negative insertion position in the output.
/// @param out_sizes Caller-owned buffer with room for at least
///     tensors[0].dim() + 1 entries; filled with the target sizes.
/// @param out_ndim Set to the output rank, tensors[0].dim() + 1.
void get_stack_out_target_size(
    exec_aten::ArrayRef<Tensor> tensors,
    int64_t dim,
    Tensor::SizesType* out_sizes,
    size_t* out_ndim) {
  *out_ndim = tensors[0].dim() + 1;

  // `dim` was validated non-negative by check_stack_args, so this cast is
  // safe and keeps the comparisons below within one (unsigned) type.
  const size_t udim = static_cast<size_t>(dim);
  for (size_t d = 0; d < *out_ndim; ++d) {
    if (d < udim) {
      // Leading dims are unchanged from the inputs.
      out_sizes[d] = tensors[0].size(d);
    } else if (d == udim) {
      // The new stack axis has one slot per input tensor.
      out_sizes[d] = tensors.size();
    } else {
      // Trailing dims are the input dims shifted right by one.
      out_sizes[d] = tensors[0].size(d - 1);
    }
  }
}
56+
57+
} // namespace executor
58+
} // namespace torch
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
#pragma once

#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
namespace executor {

/// Asserts that the arguments to stack.out are valid: non-empty input list,
/// input dtypes castable to the output dtype, all inputs sharing one shape,
/// and a non-negative `dim` in [0, input_rank + 1).
void check_stack_args(
    exec_aten::ArrayRef<Tensor> tensors,
    int64_t dim,
    Tensor& out);

/// Computes the target output shape for stack.out: the common input shape
/// with an axis of length tensors.size() inserted at `dim`. Writes the sizes
/// into `out_sizes` and the output rank into `*out_ndim`.
void get_stack_out_target_size(
    exec_aten::ArrayRef<Tensor> tensors,
    int64_t dim,
    Tensor::SizesType* out_sizes,
    size_t* out_ndim);

} // namespace executor
} // namespace torch

kernels/portable/cpu/util/targets.bzl

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,19 @@ def define_common_targets():
3737
visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."],
3838
)
3939

40+
# Shared helpers (arg checking, output-shape computation) for copy-style ops
# such as aten::stack, usable by both portable and optimized kernels.
runtime.cxx_library(
    name = "copy_ops_util",
    srcs = ["copy_ops_util.cpp"],
    exported_headers = [
        "copy_ops_util.h",
    ],
    compiler_flags = ["-Wno-missing-prototypes"],
    deps = [
        "//executorch/runtime/kernel:kernel_includes",
    ],
    visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."],
)
52+
4053
runtime.cxx_library(
4154
name = "transpose_util",
4255
exported_headers = [

0 commit comments

Comments
 (0)