[ET-VK][Ops] aten.embedding

jorgep31415 · jorgep31415 · commit c072a182a762 · 2024-05-28T18:59:33.000-07:00
## The Operator

`nn.Module` invocations on the embedding returned by [`torch.nn.Embedding`](https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html) get compiled to `aten.embedding.default` in the Edge Dialect, which carries the following signature.

```
- func: embedding(Tensor weight, Tensor indices, SymInt padding_idx=-1, bool scale_grad_by_freq=False, bool sparse=False) -> Tensor
```

## Implementation

This is a C-packing-only implementation. Interestingly, the 1D-`indices` case is equivalent to the `dim=0` case of the preceding `aten.index_select`: #3744

```
- func: index_select(Tensor self, int dim, Tensor index) -> Tensor
```

I naïvely thought the rest of the operator would be similarly easy, but it wasn't. The 2D and 3D-`indices` cases are more involved, to the extent that we require a standalone `cpp`/`glsl` file.

## Codegen

We add support for making 2D and 3D index tensors. This requires new generation functions, as well as renaming the local string inside `init_list_str` and updating the `case_name` generation so that both recursively handle nested `pylist`s.

```
// 1D
Test(weight=[10, 9], indices=[0, 2]),
// 2D
Test(weight=[10, 9], indices=[[0, 2], [1, 4], [7, 7]]),
// 3D
Test(weight=[10, 9], indices=[[[3, 1, 4], [1, 5, 9]], [[2, 6, 5], [3, 5, 8]]]),
```

Differential Revision: [D57880520](https://our.internmc.facebook.com/intern/diff/D57880520/)

ghstack-source-id: 228038402

Pull Request resolved: #3762
diff --git a/backends/vulkan/partitioner/supported_ops.py b/backends/vulkan/partitioner/supported_ops.py
@@ -99,6 +99,7 @@ def __contains__(self, op):
 ]
 
 INDEXING_OPS = [
+    exir_ops.edge.aten.embedding.default,
     exir_ops.edge.aten.index_select.default,
     exir_ops.edge.aten.select_copy.int,
     exir_ops.edge.aten.slice_copy.Tensor,
diff --git a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_type(DTYPE)}
+
+layout(std430) buffer;
+
+#include "indexing_utils.h"
+
+${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(1, "r", "t_in", "int", STORAGE)}
+${layout_declare_tensor(2, "r", "t_weight", DTYPE, STORAGE)}
+${layout_declare_ubo(3, "ivec4", "sizes")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int packed_dim = C_DIM;
+
+/*
+ * Fills one output texel per invocation. For each of the texel's four
+ * components, an embedding index is fetched from t_in and the .x component of
+ * the matching t_weight texel is copied into the output.
+ */
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  // Invocations outside the output extents have nothing to write.
+  if (pos_out_of_bounds(pos, sizes, packed_dim)) {
+    return;
+  }
+
+  const ivec4 idx = to_tensor_idx(pos, sizes, packed_dim);
+
+  // Both values depend only on idx.w, so they are the same for every
+  // component of this texel.
+  const int in_z = idx.w / 4;
+  const int in_comp = idx.w % 4;
+
+  VEC4_T texel;
+
+  // Consider optimizing via W-packing format for t_in and t_weight.
+  for (int lane = 0; lane < 4; ++lane) {
+    // Embedding index for this component.
+    const int row = texelFetch(t_in, ivec3(pos.y, idx.z * 4 + lane, in_z), 0)[in_comp];
+
+    // Embedding value selected by that index.
+    texel[lane] = texelFetch(t_weight, ivec3(pos.x, row, 0), 0).x;
+  }
+
+  imageStore(t_out, pos, texel);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/embedding.yaml b/backends/vulkan/runtime/graph/ops/glsl/embedding.yaml
@@ -0,0 +1,12 @@
+embedding:
+  parameter_names_with_default_values:
+    DTYPE: float
+    NDIM: 3
+    STORAGE: texture3d
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+      - VALUE: int
+  shader_variants:
+    - NAME: embedding
diff --git a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/DimUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+/*
+ * Validates the tensors handed to the embedding op: the weight, the indices
+ * (`in`) and the output must all use the channels-packed memory layout.
+ */
+void check_embedding_args(
+    const vTensor& weight,
+    const vTensor& in,
+    const vTensor& out) {
+  VK_CHECK_COND(check_memory_layout_is(weight, api::kChannelsPacked));
+  VK_CHECK_COND(check_memory_layout_is(in, api::kChannelsPacked));
+  VK_CHECK_COND(check_memory_layout_is(out, api::kChannelsPacked));
+}
+
+/*
+ * Appends an ExecuteNode for the embedding shader to the graph.
+ *
+ * The shader variant is picked from the output tensor's dtype, the dispatch
+ * covers the output tensor's image extents, `out` is bound for writing and
+ * `in` / `weight` (in that order) are bound for reading.
+ */
+void add_embedding_node(
+    ComputeGraph& graph,
+    ValueRef weight,
+    ValueRef in,
+    ValueRef out) {
+  vTensorPtr weight_tensor = graph.get_tensor(weight);
+  vTensorPtr indices_tensor = graph.get_tensor(in);
+  vTensorPtr out_tensor = graph.get_tensor(out);
+
+  check_embedding_args(*weight_tensor, *indices_tensor, *out_tensor);
+
+  // Dispatch dimensions are derived from the output texture.
+  const api::utils::uvec3 global_size = out_tensor->image_extents();
+  const api::utils::uvec3 local_size = adaptive_work_group_size(global_size);
+
+  // Shader name is "embedding" plus a suffix for the output dtype.
+  std::string kernel_name = "embedding";
+  kernel_name.reserve(kShaderNameReserve);
+  add_dtype_suffix(kernel_name, *out_tensor);
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_size,
+      local_size,
+      {{out, api::MemoryAccessType::WRITE},
+       {{in, weight}, api::MemoryAccessType::READ}},
+      {out_tensor->sizes_ubo()}));
+}
+
+/*
+ * Entry point registered for aten.embedding.default.
+ *
+ * Reads the weight from args[0], the indices from args[1] and the output from
+ * args[5]. args[2] through args[4] are not read here.
+ */
+void embedding(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  // Prepack weight first, then indices, before building the node.
+  const ValueRef packed_weight = prepack_if_tensor_ref(graph, args[0]);
+  const ValueRef packed_indices = prepack_if_tensor_ref(graph, args[1]);
+  const ValueRef result = args[5];
+
+  add_embedding_node(graph, packed_weight, packed_indices, result);
+}
+
+// Registers `embedding` as the implementation of aten.embedding.default.
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten.embedding.default, embedding);
+}
+
+} // namespace vkcompute
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
@@ -423,6 +423,25 @@ def get_index_select_inputs():
     return test_suite
 
 
+def get_embedding_inputs():
+    Test = namedtuple("VkEmbeddingTest", ["weight", "indices"])
+    Test.__new__.__defaults__ = (None, None)
+
+    test_cases = [
+        Test(weight=[10, 9], indices=[0, 2]),
+        Test(weight=[10, 9], indices=[2, 3, 4, 5, 7]),
+        Test(weight=[10, 9], indices=[[0, 2], [1, 4], [7, 7]]),
+        Test(weight=[10, 9], indices=[[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]]),
+        Test(weight=[10, 9], indices=[[[3, 1, 4], [1, 5, 9]], [[2, 6, 5], [3, 5, 8]]]),
+    ]
+
+    test_suite = VkTestSuite([tuple(tc) + (-1, "false", "false") for tc in test_cases])
+
+    test_suite.dtypes = ["at::kFloat"]
+    test_suite.layouts = ["api::kChannelsPacked"]
+    return test_suite
+
+
 def get_unsqueeze_inputs():
     test_suite = VkTestSuite(
         [
@@ -817,6 +836,7 @@ def get_gelu_inputs():
     "aten.slice_copy.Tensor": get_slice_inputs(),
     "aten.slice.Tensor": get_slice_inputs(),
     "aten.index_select.default": get_index_select_inputs(),
+    "aten.embedding.default": get_embedding_inputs(),
     "aten.unsqueeze_copy.default": get_unsqueeze_inputs(),
     "aten.clone.default": get_clone_inputs(),
     "aten.repeat.default": get_repeat_inputs(),
diff --git a/backends/vulkan/test/op_tests/utils/codegen_base.py b/backends/vulkan/test/op_tests/utils/codegen_base.py
@@ -78,11 +78,14 @@ def init_list_str(pylist: Any) -> str:
     if not isinstance(pylist, (list, tuple)):
         pylist = [pylist]
 
-    init_list_str = "{"
+    list_str = "{"
     for s in pylist:
-        init_list_str += f"{s}, "
-    init_list_str = init_list_str[:-2] + "}"
-    return init_list_str
+        if isinstance(s, (list, tuple)):
+            list_str += f"{init_list_str(s)}, "
+        else:
+            list_str += f"{s}, "
+    list_str = list_str[:-2] + "}"
+    return list_str
 
 
 def get_or_return_default(arg: Argument, inputs: List[Any], i: int):
@@ -105,8 +108,17 @@ def __init__(self, f: NativeFunction, test_suite: TestSuite):
             self.f, method=False, fallback_binding=self.f.manual_cpp_binding
         ).most_faithful_signature()
 
-    def gen_case_name_tuple(self, t: Tuple) -> str:
-        return "x".join([str(e) for e in t])
+    def gen_case_name_tuple(self, t) -> str:
+        return "x".join(
+            [
+                (
+                    str(e)
+                    if not isinstance(e, (list, tuple))
+                    else self.gen_case_name_tuple(e)
+                )
+                for e in t
+            ]
+        )
 
     def gen_case_name(self, inputs: List[Any], prepack: bool = False) -> str:
         name_str = self.op_name
@@ -119,7 +131,7 @@ def gen_case_name(self, inputs: List[Any], prepack: bool = False) -> str:
             elif isinstance(arg_sizes_or_val, list):
                 lst = []
                 for size in arg_sizes_or_val:
-                    if isinstance(size, tuple):
+                    if isinstance(size, (list, tuple)):
                         lst.append(self.gen_case_name_tuple(size))
                     else:
                         lst.append(str(size))
@@ -154,7 +166,7 @@ def create_input_data(self, arg: Argument, data: Any) -> str:  # noqa: C901
             ret_str = f"{cpp_type} {arg.name} = "
 
         if cpp_type == AT_TENSOR:
-            if arg.name == "index":
+            if arg.name == "index" or arg.name == "indices":
                 ret_str += f"make_index_tensor({init_list_str(data)});"
             else:
                 ret_str += (
@@ -283,19 +295,52 @@ def generate_suite_cpp(self) -> str:
     values[i] = (float) i;
   }}
 
-  // from_blob doesn't take ownership of data. Hence must create a copy as
-  // "values" will go out of scope.
+  // Clone as original data will be deallocated upon return.
   return at::from_blob(values.data(), sizes, at::kFloat).toType(dtype).detach().clone();
 }}
 
 
 at::Tensor make_index_tensor(std::vector<int64_t> indices) {{
-  int64_t size = static_cast<int64_t>(indices.size());
   at::ScalarType dtype = at::kInt;
+  std::vector<int64_t> sizes = {{static_cast<int64_t>(indices.size())}};
+
+  // Clone as original data will be deallocated upon return.
+  return at::from_blob(indices.data(), sizes, dtype).detach().clone();
+}}
+
+// Builds a 2D at::kInt index tensor from nested vectors. The column count is
+// taken from the first row.
+at::Tensor make_index_tensor(std::vector<std::vector<int64_t>> indices) {{
+  at::ScalarType dtype = at::kInt;
+  // Do not index into indices[0] when there are no rows.
+  const int64_t rows = static_cast<int64_t>(indices.size());
+  const int64_t cols = rows == 0 ? 0 : static_cast<int64_t>(indices[0].size());
+  std::vector<int64_t> sizes = {{rows, cols}};
+
+  // Flatten indices as from_blob reads garbage otherwise.
+  std::vector<int64_t> acc;
+  for (auto& vec: indices) {{
+    acc.insert(acc.end(), vec.begin(), vec.end());
+  }}
+
+  // from_blob reinterprets the buffer without converting it, so describe the
+  // data with its real element type (int64) and convert to the index dtype
+  // afterwards. Clone as original data will be deallocated upon return.
+  return at::from_blob(acc.data(), sizes, at::kLong).toType(dtype).detach().clone();
+}}
+
+// Builds a 3D at::kInt index tensor from nested vectors. The inner dimensions
+// are taken from the first element at each nesting level.
+at::Tensor make_index_tensor(std::vector<std::vector<std::vector<int64_t>>> indices) {{
+  at::ScalarType dtype = at::kInt;
+  // Do not index into an empty vector when deriving the inner dimensions.
+  const int64_t dim0 = static_cast<int64_t>(indices.size());
+  const int64_t dim1 = dim0 == 0 ? 0 : static_cast<int64_t>(indices[0].size());
+  const int64_t dim2 = dim1 == 0 ? 0 : static_cast<int64_t>(indices[0][0].size());
+  std::vector<int64_t> sizes = {{dim0, dim1, dim2}};
+
+  // Flatten indices as from_blob reads garbage otherwise.
+  std::vector<int64_t> acc;
+  for (auto& v: indices) {{
+    for (auto& vv: v) {{
+      acc.insert(acc.end(), vv.begin(), vv.end());
+    }}
+  }}
 
-  // from_blob doesn't take ownership of data. Hence must create a copy as
-  // "values" will go out of scope.
-  return at::from_blob(indices.data(), {{size}}, dtype).detach().clone();
+  // from_blob reinterprets the buffer without converting it, so describe the
+  // data with its real element type (int64) and convert to the index dtype
+  // afterwards. Clone as original data will be deallocated upon return.
+  return at::from_blob(acc.data(), sizes, at::kLong).toType(dtype).detach().clone();
 }}
 
 {test_suites_cpp}
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
@@ -1291,3 +1291,48 @@ def forward(self, x):
             sample_inputs,
             memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
         )
+
+    def test_vulkan_backend_embedding_1d(self):
+        class EmbeddingModule(torch.nn.Module):
+            def __init__(self, embedding):
+                super().__init__()
+                self.embedding = embedding
+
+            def forward(self, x):
+                return self.embedding(x)
+
+        self.lower_module_and_test_output(
+            EmbeddingModule(torch.nn.Embedding(4, 5)),
+            (torch.tensor([0, 1, 0, 4, 2, 0], dtype=torch.int32),),
+            memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
+        )
+
+    def test_vulkan_backend_embedding_2d(self):
+        class EmbeddingModule(torch.nn.Module):
+            def __init__(self, embedding):
+                super().__init__()
+                self.embedding = embedding
+
+            def forward(self, x):
+                return self.embedding(x)
+
+        self.lower_module_and_test_output(
+            EmbeddingModule(torch.nn.Embedding(4, 5)),
+            (torch.tensor([[0, 1, 0], [4, 2, 0]], dtype=torch.int32),),
+            memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
+        )
+
+    def test_vulkan_backend_embedding_3d(self):
+        class EmbeddingModule(torch.nn.Module):
+            def __init__(self, embedding):
+                super().__init__()
+                self.embedding = embedding
+
+            def forward(self, x):
+                return self.embedding(x)
+
+        self.lower_module_and_test_output(
+            EmbeddingModule(torch.nn.Embedding(4, 5)),
+            (torch.tensor([[[0, 1], [0, 1]], [[4, 2], [3, 3]]], dtype=torch.int32),),
+            memory_layouts=[vk_graph_schema.VkMemoryLayout.TENSOR_CHANNELS_PACKED],
+        )

Original file line number	Diff line number	Diff line change
`@@ -99,6 +99,7 @@ def __contains__(self, op):`
`99`	`99`	`]`
`100`	`100`
`101`	`101`	`INDEXING_OPS = [`
	`102`	`+ exir_ops.edge.aten.embedding.default,`
`102`	`103`	`exir_ops.edge.aten.index_select.default,`
`103`	`104`	`exir_ops.edge.aten.select_copy.int,`
`104`	`105`	`exir_ops.edge.aten.slice_copy.Tensor,`