
Commit 69b4e75

SS-JIA authored and facebook-github-bot committed
Add native_batch_norm
Summary: Add implementation of native_batch_norm in portable kernels

Reviewed By: kimishpatel

Differential Revision: D47878889

fbshipit-source-id: 3be2221c6df04fd73810a189484f1290b2908aca
1 parent 43fa887 commit 69b4e75
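
For reference, in no-training mode the kernel applies the standard per-channel batch-norm transform using the tracked statistics; `momentum` is accepted but unused, since the running statistics are not updated:

$$y = \frac{x - \hat{\mu}_c}{\sqrt{\hat{\sigma}_c^2 + \epsilon}} \cdot \gamma_c + \beta_c$$

where $\hat{\mu}_c$ and $\hat{\sigma}_c^2$ come from `running_mean` and `running_var`, and $\gamma_c$, $\beta_c$ from the optional `weight` and `bias` (defaulting to 1 and 0).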

File tree

8 files changed: +1017 -0 lines changed

kernels/portable/cpu/op_native_batch_norm.cpp

Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cmath>
#include <tuple>

#include <executorch/kernels/portable/cpu/util/normalization_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>
#include <executorch/runtime/platform/assert.h>

namespace torch {
namespace executor {
namespace native {

using Tensor = exec_aten::Tensor;

std::tuple<Tensor&, Tensor&, Tensor&> _native_batch_norm_legit_no_training_out(
    RuntimeContext& ctx,
    const Tensor& in,
    const exec_aten::optional<Tensor>& weight,
    const exec_aten::optional<Tensor>& bias,
    const Tensor& running_mean,
    const Tensor& running_var,
    double momentum,
    double eps,
    Tensor& out,
    Tensor& mean_out,
    Tensor& var_out) {
  (void)ctx;

  ET_CHECK(resize_tensor(out, in.sizes()) == Error::Ok);

  check_batch_norm_args(
      in, weight, bias, running_mean, running_var, momentum, eps, out);
  // For now, only support the default dim order
  ET_CHECK(is_default_dim_order(in.dim_order().data(), in.dim_order().size()));

  size_t C_dim = in.dim() >= 1 ? 1 : 0;
  size_t C = in.size(C_dim);
  size_t outer = getLeadingDims(in, C_dim);
  size_t inner = getTrailingDims(in, C_dim);

  ET_SWITCH_FLOAT_TYPES(
      in.scalar_type(), ctx, "native_batch_norm_legit_no_training", CTYPE, [&] {
        const CTYPE* in_data = in.const_data_ptr<CTYPE>();
        CTYPE* out_data = out.mutable_data_ptr<CTYPE>();

        const CTYPE* const mean_data = running_mean.const_data_ptr<CTYPE>();
        const CTYPE* const var_data = running_var.const_data_ptr<CTYPE>();

        for (size_t i = 0; i < outer; ++i) {
          for (size_t c = 0; c < C; ++c) {
            CTYPE mean = mean_data[c];
            CTYPE var = var_data[c];
            CTYPE invstd = 1.0 / std::sqrt(var + eps);
            CTYPE weight_val = 1;
            if (weight.has_value()) {
              weight_val = weight.value().const_data_ptr<CTYPE>()[c];
            }
            CTYPE bias_val = 0;
            if (bias.has_value()) {
              bias_val = bias.value().const_data_ptr<CTYPE>()[c];
            }
            for (size_t j = 0; j < inner; ++j) {
              *out_data = (*in_data - mean) * invstd * weight_val + bias_val;
              out_data++;
              in_data++;
            }
          }
        }
      });

  return {out, mean_out, var_out};
}

} // namespace native
} // namespace executor
} // namespace torch
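
For intuition, here is a minimal standalone sketch of the same inference-mode math on raw float arrays. It is illustrative only, not part of the commit: the helper name and the contiguous-layout assumption (`outer` elements before the channel dim, `inner` after, exactly what getLeadingDims/getTrailingDims compute) are mine.

```cpp
#include <cmath>
#include <cstddef>

// Reference sketch (hypothetical): normalize a contiguous buffer laid out
// as [outer, C, inner], mirroring the kernel's loop structure above.
void batch_norm_inference_ref(
    const float* in,
    float* out,
    const float* mean, // length C (running_mean)
    const float* var, // length C (running_var)
    const float* weight, // length C, or nullptr for the default of 1
    const float* bias, // length C, or nullptr for the default of 0
    std::size_t outer,
    std::size_t C,
    std::size_t inner,
    double eps) {
  for (std::size_t i = 0; i < outer; ++i) {
    for (std::size_t c = 0; c < C; ++c) {
      const float invstd = 1.0f / std::sqrt(var[c] + static_cast<float>(eps));
      const float w = weight ? weight[c] : 1.0f;
      const float b = bias ? bias[c] : 0.0f;
      for (std::size_t j = 0; j < inner; ++j) {
        *out++ = (*in++ - mean[c]) * invstd * w + b;
      }
    }
  }
}
```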

kernels/portable/cpu/targets.bzl

Lines changed: 6 additions & 0 deletions
@@ -505,6 +505,12 @@ _ATEN_OPS = (
             ":scalar_utils",
         ],
     ),
+    op_target(
+        name = "op_native_batch_norm",
+        deps = [
+            "//executorch/kernels/portable/cpu/util:normalization_ops_util",
+        ],
+    ),
     op_target(
         name = "op_native_layer_norm",
         deps = [
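
`op_target` is the portable-kernels helper macro that, by convention, builds each `op_<name>.cpp` into its own small library; the only extra dependency here is the new `normalization_ops_util` target defined below.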
kernels/portable/cpu/util/normalization_ops_util.cpp

Lines changed: 55 additions & 0 deletions

@@ -0,0 +1,55 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <cstring>

#include <executorch/runtime/core/exec_aten/exec_aten.h>
#include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
#include <executorch/runtime/core/exec_aten/util/tensor_util.h>
#include <executorch/runtime/platform/assert.h>

namespace torch {
namespace executor {

using Tensor = exec_aten::Tensor;

void check_batch_norm_args(
    const Tensor& in,
    const exec_aten::optional<Tensor>& weight,
    const exec_aten::optional<Tensor>& bias,
    const Tensor& running_mean,
    const Tensor& running_var,
    double momentum,
    double eps,
    Tensor& out) {
  // All tensors must be the same dtype
  ET_CHECK_SAME_DTYPE3(in, running_mean, running_var);
  ET_CHECK_SAME_DTYPE2(in, out);
  if (weight.has_value()) {
    ET_CHECK_SAME_DTYPE2(in, weight.value());
  }
  if (bias.has_value()) {
    ET_CHECK_SAME_DTYPE2(in, bias.value());
  }

  size_t C_dim = in.dim() >= 1 ? 1 : 0;
  // All parameter tensors must be of dim 1 and have length equal to the
  // channels dim of in
  ET_CHECK(running_mean.dim() == 1 && running_mean.size(0) == in.size(C_dim));
  ET_CHECK(running_var.dim() == 1 && running_var.size(0) == in.size(C_dim));
  if (weight.has_value()) {
    ET_CHECK(
        weight.value().dim() == 1 && weight.value().size(0) == in.size(C_dim));
  }
  if (bias.has_value()) {
    ET_CHECK(bias.value().dim() == 1 && bias.value().size(0) == in.size(C_dim));
  }
}

} // namespace executor
} // namespace torch
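
Concretely: a float32 input of shape [2, 3, 4, 5] has C_dim = 1 and C = 3, so `running_mean`, `running_var`, and (if present) `weight` and `bias` must each be 1-D float32 tensors of length 3. A `running_mean` of shape [3, 1] trips the `dim() == 1` check, and a float64 `weight` trips `ET_CHECK_SAME_DTYPE2`.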
kernels/portable/cpu/util/normalization_ops_util.h

Lines changed: 27 additions & 0 deletions

@@ -0,0 +1,27 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

#include <executorch/runtime/kernel/kernel_includes.h>

namespace torch {
namespace executor {

void check_batch_norm_args(
    const Tensor& in,
    const exec_aten::optional<Tensor>& weight,
    const exec_aten::optional<Tensor>& bias,
    const Tensor& running_mean,
    const Tensor& running_var,
    double momentum,
    double eps,
    Tensor& out);

} // namespace executor
} // namespace torch

kernels/portable/cpu/util/targets.bzl

Lines changed: 13 additions & 0 deletions
@@ -50,6 +50,19 @@ def define_common_targets():
         visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."],
     )
 
+    runtime.cxx_library(
+        name = "normalization_ops_util",
+        srcs = ["normalization_ops_util.cpp"],
+        exported_headers = [
+            "normalization_ops_util.h",
+        ],
+        compiler_flags = ["-Wno-missing-prototypes"],
+        deps = [
+            "//executorch/runtime/kernel:kernel_includes",
+        ],
+        visibility = ["//executorch/kernels/portable/cpu/...", "//executorch/kernels/optimized/cpu/..."],
+    )
+
     runtime.cxx_library(
         name = "transpose_util",
         exported_headers = [

kernels/portable/functions.yaml

Lines changed: 5 additions & 0 deletions
@@ -22,6 +22,11 @@
     - arg_meta: null
       kernel_name: torch::executor::log_softmax_out
 
+- op: _native_batch_norm_legit_no_training.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::_native_batch_norm_legit_no_training_out
+
 - op: _softmax.out
   kernels:
     - arg_meta: null