
Commit 74576e8

copyrightly authored and facebook-github-bot committed
native_layer_norm (for width dim) (#3001)
Summary: Pull Request resolved: #3001

We implement `native_layer_norm`, which has 3 outputs:

- the normalization of the input tensor according to the given `normalized_shape`
- the mean
- 1/sqrt(var + eps)

```
func: native_layer_norm(Tensor input, SymInt[] normalized_shape, Tensor? weight, Tensor? bias, float eps) -> (Tensor, Tensor, Tensor)
```

Per SS-JIA's suggestion, a model-specific implementation is more performant than, and preferred to, a generic one, so we implemented the op in the following optimized way:

- our current use case has a `normalized_shape` of length 1, i.e. we normalize by computing the mean and variance over the last (width) dim
- we do the computation in a single shader, `native_layer_norm.glsl`, without invoking separate shaders to compute the mean and variance
- we use [Welford's online algorithm](https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm) to compute the mean and variance in one pass (see the sketch below)

Reviewed By: SS-JIA, jorgep31415

Differential Revision: D56005629

fbshipit-source-id: 096c2e2f04b95f1f5c9205c4827091169771978c
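For readers new to the single-pass formulation, here is a minimal NumPy sketch of the per-row computation the shader performs (the helper name and the NumPy dependency are illustrative, not part of this commit):

```python
import numpy as np

def welford_layer_norm(x, weight, bias, eps=1e-5):
    # Welford accumulation of the running mean and M2 (sum of squared
    # deviations) over the last (width) dim, in a single pass.
    mean = np.zeros(x.shape[:-1])
    m2 = np.zeros(x.shape[:-1])
    for w in range(x.shape[-1]):
        v = x[..., w]
        delta = v - mean
        mean = mean + delta / (w + 1)
        delta2 = v - mean  # deviation from the *updated* mean
        m2 = m2 + delta * delta2
    var = m2 / x.shape[-1]           # population variance, as layer norm uses
    rstd = 1.0 / np.sqrt(var + eps)  # the op's third output
    out = (x - mean[..., None]) * rstd[..., None] * weight + bias
    return out, mean, rstd
```

The three return values line up with the `(Tensor, Tensor, Tensor)` signature above: the normalized output, the mean, and 1/sqrt(var + eps).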
1 parent: 075fe40 · commit: 74576e8

8 files changed, +300 −6 lines changed

backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -50,6 +50,8 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
             exir_ops.edge.aten.sum.dim_IntList,
             # Convolution operators
             exir_ops.edge.aten.convolution.default,
+            # Normalization
+            exir_ops.edge.aten.native_layer_norm.default,
             # Other
             operator.getitem,
         ]
```
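This one-line registration is all the partitioner needs: once `aten.native_layer_norm.default` is in the supported list, the Vulkan delegate claims those nodes, and the already-supported `operator.getitem` serves the elements of the op's three-tensor output.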
backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.glsl

Lines changed: 80 additions & 0 deletions

```glsl
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#include "broadcasting_utils.h"
#include "indexing_utils.h"

#define PRECISION ${PRECISION}
#define VEC4_T ${texel_type(DTYPE)}
#define to_tensor_idx to_tensor_idx_${PACKING}

layout(std430) buffer;

layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
layout(set = 0, binding = 1, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_mean;
layout(set = 0, binding = 2, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_rstd;

layout(set = 0, binding = 3) uniform PRECISION sampler3D image_in;
layout(set = 0, binding = 4) uniform PRECISION sampler3D weight_in;
layout(set = 0, binding = 5) uniform PRECISION sampler3D bias_in;

layout(set = 0, binding = 6) uniform PRECISION restrict OutExtents {
  uvec4 data;
}
out_sizes;

layout(set = 0, binding = 7) uniform PRECISION restrict Epsilon {
  float data;
}
epsilon;

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);
  const ivec4 idx = to_tensor_idx(pos, out_sizes.data);

  if (any(greaterThanEqual(idx, out_sizes.data))) {
    return;
  }

  const int width = int(out_sizes.data.x);

  VEC4_T mean = VEC4_T(0);
  VEC4_T delta = VEC4_T(0);
  VEC4_T delta2 = VEC4_T(0);
  VEC4_T M2 = VEC4_T(0);

  // Use Welford's online algorithm to compute mean and variance in one pass
  // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Welford's_online_algorithm
  for (int w = 0; w < width; ++w) {
    VEC4_T v = texelFetch(image_in, ivec3(w, pos.y, pos.z), 0);
    delta = v - mean;
    mean += delta / (w + 1);
    delta2 = v - mean;
    M2 += delta * delta2;
  }

  VEC4_T var = M2 / width;
  VEC4_T rstd = pow(var + epsilon.data, VEC4_T(-0.5));
  VEC4_T offset = -rstd * mean;

  for (int w = 0; w < width; ++w) {
    VEC4_T v = texelFetch(image_in, ivec3(w, pos.y, pos.z), 0);
    // broadcasting
    VEC4_T weight = texelFetch(weight_in, ivec3(w, 0, 0), 0).xxxx;
    VEC4_T bias = texelFetch(bias_in, ivec3(w, 0, 0), 0).xxxx;
    VEC4_T outtex = (v * rstd + offset) * weight + bias;
    imageStore(image_out, ivec3(w, pos.y, pos.z), outtex);
  }

  imageStore(image_mean, pos, mean);
  imageStore(image_rstd, pos, rstd);
}
```
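One algebraic detail in the write-out loop above: since (v - mean) * rstd == v * rstd - rstd * mean, the shader precomputes `offset = -rstd * mean` once per row, so each texel needs only a multiply-add (`v * rstd + offset`) before the weight/bias affine transform.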
backends/vulkan/runtime/graph/ops/glsl/native_layer_norm.yaml

Lines changed: 17 additions & 0 deletions

```yaml
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

native_layer_norm:
  parameter_names_with_default_values:
    NDIM: 3
    DTYPE: float
    PACKING: C_packed
  generate_variant_forall:
    DTYPE:
      - VALUE: half
      - VALUE: float
  shader_variants:
    - NAME: native_layer_norm
```
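This YAML drives the shader codegen: `generate_variant_forall` stamps out the template for both `half` and `float` DTYPEs, yielding dtype-suffixed kernel names that the C++ code below selects via `add_dtype_suffix`.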
backends/vulkan/runtime/graph/ops/impl/NativeLayerNorm.cpp

Lines changed: 127 additions & 0 deletions

```cpp
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/KernelUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>

#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace vkcompute {

std::vector<int64_t> calc_out_mean_sizes(
    vTensor& self,
    int64_t normalized_shape_dim) {
  std::vector<int64_t> output_size = self.sizes();
  int64_t self_dim = self.sizes().size();
  for (int64_t i = 0; i < normalized_shape_dim; ++i) {
    output_size.at(self_dim - i - 1) = 1;
  }
  return output_size;
}

void resize_native_layer_norm_node(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& extra_args) {
  vTensorPtr out = graph->get_tensor(args[0].refs[0]);
  vTensorPtr mean = graph->get_tensor(args[0].refs[1]);
  vTensorPtr rstd = graph->get_tensor(args[0].refs[2]);
  vTensorPtr in = graph->get_tensor(args[1].refs[0]);
  std::vector<int64_t> in_sizes = in->sizes();

  const auto normalized_shape_dim = graph->get_int_list(extra_args[0])->size();

  std::vector<int64_t> mean_size =
      calc_out_mean_sizes(*in, normalized_shape_dim);

  out->virtual_resize(in_sizes);
  mean->virtual_resize(mean_size);
  rstd->virtual_resize(mean_size);
}

void check_args(const vTensor& in, const vTensor& out) {
  VK_CHECK_COND(check_memory_layout_is(in, api::kChannelsPacked));
  VK_CHECK_COND(check_memory_layout_is(out, api::kChannelsPacked));
}

void add_native_layer_norm_node(
    ComputeGraph& graph,
    const ValueRef in,
    const ValueRef normalized_shape,
    const ValueRef weight,
    const ValueRef bias,
    const ValueRef eps,
    const ValueRef out) {
  const auto normalized_shape_dim =
      graph.get_int_list(normalized_shape)->size();
  if (normalized_shape_dim > 1) {
    VK_THROW("native_layer_norm only supports normalized_shape with dim == 1");
  }

  if (graph.val_is_none(weight)) {
    VK_THROW("native_layer_norm requires weight to be non-None");
  }

  if (graph.val_is_none(bias)) {
    VK_THROW("native_layer_norm requires bias to be non-None");
  }

  ValueRef arg_in = prepack_if_tensor_ref(graph, in);
  ValueRef arg_weight =
      prepack_if_tensor_ref(graph, weight, graph.memory_layout_of(arg_in));
  ValueRef arg_bias =
      prepack_if_tensor_ref(graph, bias, graph.memory_layout_of(arg_in));

  const auto out_val = graph.get_value_list(out);
  vTensorPtr t_out = graph.get_tensor(out_val->at(0));
  vTensorPtr t_mean = graph.get_tensor(out_val->at(1));
  vTensorPtr t_input = graph.get_tensor(in);
  float epsilon = graph.extract_scalar<float>(eps);

  check_args(*t_input, *t_out);

  std::vector<int64_t> in_sizes = t_input->sizes();

  api::utils::uvec3 global_size = t_mean->extents();
  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);

  std::string kernel_name("native_layer_norm");
  kernel_name.reserve(kShaderNameReserve);

  add_dtype_suffix(kernel_name, *t_out);

  graph.execute_nodes().emplace_back(new ExecuteNode(
      graph,
      VK_KERNEL_FROM_STR(kernel_name),
      global_size,
      local_size,
      // Inputs and Outputs
      {{{out_val->at(0), out_val->at(1), out_val->at(2)},
        api::MemoryAccessType::WRITE},
       {{arg_in, arg_weight, arg_bias}, api::MemoryAccessType::READ}},
      // Shader params buffers
      {t_out->gpu_sizes_ubo(), graph.create_params_buffer(epsilon)},
      // Resizing
      resize_native_layer_norm_node,
      {normalized_shape}));
}

void native_layer_norm(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  return add_native_layer_norm_node(
      graph, args[0], args[1], args[2], args[3], args[4], args[5]);
}

REGISTER_OPERATORS {
  VK_REGISTER_OP(aten.native_layer_norm.default, native_layer_norm);
}

} // namespace vkcompute
```
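Note how the dispatch is sized: `global_size` is taken from the mean tensor's extents, whose width is collapsed to 1 by `calc_out_mean_sizes`, so each invocation owns one output row and loops over the width twice inside the shader: once for the Welford statistics and once to write the normalized texels.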

backends/vulkan/test/op_tests/cases.py

Lines changed: 13 additions & 0 deletions
```diff
@@ -118,6 +118,18 @@ def get_conv2d_inputs():
     return test_suite


+def get_native_layer_norm_inputs():
+    test_suite = VkTestSuite(
+        [
+            ((S1, S2), [S2], (S2), (S2), 0.001),
+            ((M, M1, M2), [M2], (M2), (M2), 0.001),
+            ((L, XL, M1, M2), [M2], (M2), (M2), 0.001),
+        ]
+    )
+    test_suite.supports["layouts"] = ["api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED"]
+    return test_suite
+
+
 test_suites = {
     "aten.add.Tensor": get_binary_elementwise_inputs(),
     "aten.sub.Tensor": get_binary_elementwise_inputs(),
@@ -126,6 +138,7 @@ def get_conv2d_inputs():
     "aten.mm.default": get_mm_inputs(),
     "aten.max_pool2d_with_indices.default": get_pool2d_inputs(),
     "aten.convolution.default": get_conv2d_inputs(),
+    "aten.native_layer_norm.default": get_native_layer_norm_inputs(),
 }

 prepacked_args = {"aten.mm.default": {"mat2"}}
```
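Each test tuple follows the op's argument order, (input sizes, normalized_shape, weight sizes, bias sizes, eps), and every case normalizes only over the last dim, matching the shader's width-dim restriction.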

backends/vulkan/test/op_tests/utils/codegen.py

Lines changed: 38 additions & 5 deletions
```diff
@@ -15,10 +15,12 @@
     AT_TENSOR_OPT,
     BOOL,
     CppTestFileGen,
+    DOUBLE,
     INT,
-    TENSOR_TUPLE,
     TestSuite,
     TestSuiteGen,
+    THREE_TENSOR_TUPLE,
+    TWO_TENSOR_TUPLE,
 )
 from torchgen.api import cpp
 from torchgen.api.types import CppSignatureGroup
@@ -118,7 +120,7 @@ def __init__(self, op_reg_name: str, f: NativeFunction, suite_def: TestSuite):
             self.refs["out"] = ValueRef(
                 name="out_ref", src_cpp_name="out", src_cpp_type=ret_type, is_out=True
             )
-        elif ret_type == TENSOR_TUPLE:
+        elif ret_type == TWO_TENSOR_TUPLE:
             self.refs["out"] = [
                 ValueRef(
                     name="out_ref_first",
@@ -139,6 +141,33 @@ def __init__(self, op_reg_name: str, f: NativeFunction, suite_def: TestSuite):
                     is_out=False,
                 ),
             ]
+        elif ret_type == THREE_TENSOR_TUPLE:
+            self.refs["out"] = [
+                ValueRef(
+                    name="out_ref_first",
+                    src_cpp_name="std::get<0>(out)",
+                    src_cpp_type="at::Tensor",
+                    is_out=True,
+                ),
+                ValueRef(
+                    name="out_ref_second",
+                    src_cpp_name="std::get<1>(out)",
+                    src_cpp_type="at::Tensor",
+                    is_out=True,
+                ),
+                ValueRef(
+                    name="out_ref_third",
+                    src_cpp_name="std::get<2>(out)",
+                    src_cpp_type="at::Tensor",
+                    is_out=True,
+                ),
+                ValueRef(
+                    name="out_ref",
+                    src_cpp_name="out",
+                    src_cpp_type=ret_type,
+                    is_out=False,
+                ),
+            ]

 ## ATen code generation

@@ -210,8 +239,12 @@ def create_value_for(self, ref: ValueRefList) -> str:  # noqa: C901
             ret_str += f"add_scalar<bool>({ref.src_cpp_name}); \n"
         elif ref.src_cpp_type == INT:
             ret_str += f"add_scalar<int64_t>({ref.src_cpp_name}); \n"
-        elif ref.src_cpp_type == TENSOR_TUPLE:
+        elif ref.src_cpp_type == DOUBLE:
+            ret_str += f"add_scalar<double>({ref.src_cpp_name}); \n"
+        elif ref.src_cpp_type == TWO_TENSOR_TUPLE:
             ret_str += f"add_value_list({{{ref.name}_first, {ref.name}_second}}); \n"
+        elif ref.src_cpp_type == THREE_TENSOR_TUPLE:
+            ret_str += f"add_value_list({{{ref.name}_first, {ref.name}_second, {ref.name}_third}}); \n"
         else:
             raise RuntimeError(f"Unsupported cpp type {ref.src_cpp_type}")

@@ -441,9 +474,9 @@ def gen_parameterization(self) -> str:
 }

 #ifdef USE_VULKAN_FP16_INFERENCE
-bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-2, float atol=1e-3) {
+bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-2, float atol=1e-2) {
 #else
-bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-5, float atol=1e-8) {
+bool check_close(at::Tensor& t1, at::Tensor& t2, float rtol=1e-5, float atol=1e-5) {
 #endif
   // Skip checking index tensors
   if (t1.scalar_type() == at::kLong || t2.scalar_type() == at::kLong) {
```
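Splitting `TENSOR_TUPLE` into `TWO_TENSOR_TUPLE` and `THREE_TENSOR_TUPLE` lets the generated test bind each element of a three-tensor result via `std::get<0..2>(out)` and register them together with `add_value_list`, which is how the (out, mean, rstd) triple reaches the Vulkan graph. The `atol` defaults are also relaxed to match their `rtol` counterparts (1e-2 under fp16, 1e-5 otherwise).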

backends/vulkan/test/op_tests/utils/codegen_base.py

Lines changed: 5 additions & 1 deletion
```diff
@@ -21,7 +21,9 @@
 AT_TENSOR_OPT = "::std::optional<at::Tensor>"
 BOOL = "bool"
 INT = "int64_t"
-TENSOR_TUPLE = "::std::tuple<at::Tensor,at::Tensor>"
+DOUBLE = "double"
+TWO_TENSOR_TUPLE = "::std::tuple<at::Tensor,at::Tensor>"
+THREE_TENSOR_TUPLE = "::std::tuple<at::Tensor,at::Tensor,at::Tensor>"

 ###########################
 ## Test Suite definition ##
@@ -131,6 +133,8 @@ def create_input_data(self, arg: Argument, data: Any) -> str:
             ret_str += f"{str(data).lower()};"
         elif cpp_type == INT:
             ret_str += f"{str(data).lower()};"
+        elif cpp_type == DOUBLE:
+            ret_str += f"{str(data).lower()};"
         else:
             raise RuntimeError(f"Unsupported cpp type {cpp_type}")
         return ret_str + "\n"
```
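`DOUBLE` reuses the same literal lowering as `BOOL` and `INT`: `str(data).lower()` turns the Python value into its C++ spelling, which is all a float scalar such as `eps` needs.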

backends/vulkan/test/test_vulkan_delegate.py

Lines changed: 18 additions & 0 deletions
```diff
@@ -647,3 +647,21 @@ def forward(self, x):
             sample_inputs,
             memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
         )
+
+    def test_vulkan_backend_native_layer_norm(self):
+        class NativeLayerNormModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+
+            def forward(self, x):
+                return torch.native_layer_norm(
+                    x, [5], torch.ones(5), torch.zeros(5), 1e-5
+                )
+
+        sample_inputs = (torch.randn(size=(3, 4, 5), dtype=torch.float32),)
+
+        self.lower_module_and_test_output(
+            NativeLayerNormModule(),
+            sample_inputs,
+            memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
+        )
```
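As a quick eager-mode illustration of the contract the delegate must reproduce (a standalone sketch, not part of the test suite):

```python
import torch

x = torch.randn(3, 4, 5)
out, mean, rstd = torch.native_layer_norm(
    x, [5], torch.ones(5), torch.zeros(5), 1e-5
)

# mean and rstd keep the leading dims, with the normalized dim collapsed to 1
assert mean.shape == rstd.shape == (3, 4, 1)
# rstd is 1/sqrt(var + eps), using population (unbiased=False) variance
expected = (x.var(-1, unbiased=False, keepdim=True) + 1e-5).rsqrt()
assert torch.allclose(rstd, expected, atol=1e-6)
```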
