
Commit 5f133b3

SS-JIA authored and facebook-github-bot committed
Add matmul operator (#2517)
Summary:
Pull Request resolved: #2517

## Context

Add matrix multiplication operator support.

ghstack-source-id: 219281501
Reviewed By: jorgep31415
Differential Revision: D55031043
fbshipit-source-id: d7f5a3ff1e421602e75ec1904043ca07681a3b35
1 parent f7300b2 commit 5f133b3


9 files changed: +371, -3 lines changed


backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 4 additions & 2 deletions
@@ -26,17 +26,19 @@
 class VulkanSupportedOperators(OperatorSupportBase):
     def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
         supported = node.op == "call_function" and node.target in [
-            # BinaryOp
+            # Binary arithmetic operators
             exir_ops.edge.aten.add.Tensor,
             exir_ops.edge.aten.sub.Tensor,
             exir_ops.edge.aten.mul.Tensor,
             exir_ops.edge.aten.div.Tensor,
             exir_ops.edge.aten.div.Tensor_mode,
             exir_ops.edge.aten.pow.Tensor_Tensor,
-            # Clamp
+            # Activation operators
             exir_ops.edge.aten.clamp.default,
             exir_ops.edge.aten.hardtanh.default,
             exir_ops.edge.aten.relu.default,
+            # Matrix multiplication operators
+            exir_ops.edge.aten.mm.default,
             # Other
             operator.getitem,
         ]
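Note (not part of the diff): with aten.mm.default in the supported list, a plain 2-D matrix multiply, which for 2-D inputs typically lowers to aten.mm.default in the Edge dialect, can now be claimed by the Vulkan partitioner instead of falling back to CPU. A minimal sketch of the kind of module this enables, assuming the usual export flow:

import torch

x = torch.randn(31, 63)
w = torch.randn(63, 22)
y = torch.matmul(x, w)  # 2-D inputs -> aten.mm.default after export to Edge
assert y.shape == (31, 22)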

backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,8 @@
  * LICENSE file in the root directory of this source tree.
  */

+#define DIVUP4(x) ((x + 3) / 4)
+
 #define PACKED_DIM_CHANNELS_PACKED(vec) vec.z

 #define PACKED_DIM_WIDTH_PACKED(vec) vec.x
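DIVUP4 is a ceiling division by 4: it converts an element count along the packed dimension into the number of 4-element texels needed to cover it. A minimal Python sketch of the same arithmetic (illustration only, not part of the diff):

def divup4(x: int) -> int:
    # equivalent to the DIVUP4 macro: ceil(x / 4)
    return (x + 3) // 4

assert divup4(63) == 16  # a 63-element row occupies 16 four-element texels
assert divup4(64) == 16
assert divup4(65) == 17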
backends/vulkan/runtime/graph/ops/glsl/matmul.glsl

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#version 450 core

#include "indexing_utils.h"

#define PRECISION ${PRECISION}

layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out;
layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;

layout(set = 0, binding = 3) uniform PRECISION restrict OutExtents {
  uvec4 data;
}
out_extents;

layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
  ivec4 data;
}
in_sizes;

layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;

void main() {
  const ivec3 pos = ivec3(gl_GlobalInvocationID);

  if (any(greaterThanEqual(pos, out_extents.data.xyz))) {
    return;
  }

  vec4 texel = vec4(0);

  ivec3 mat1_pos = ivec3(0, pos.y, pos.z);

  $if MAT2_PACKING == "HEIGHT_PACKED":
    ivec3 mat2_pos = ivec3(pos.x * 4, 0, pos.z);
  $else:
    ivec3 mat2_pos = ivec3(pos.x, 0, pos.z);

  $if MAT1_PACKING == "WIDTH_PACKED":
    int K = DIVUP4(in_sizes.data[0]);
    for (int i = 0; i < K; ++i) {
      $if MAT2_PACKING == "HEIGHT_PACKED":
        vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0);
        vec4 sums = vec4(
            dot(mat1_tex, texelFetch(im_mat2, mat2_pos, 0)),
            dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(1, 0, 0), 0)),
            dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(2, 0, 0), 0)),
            dot(mat1_tex, texelFetch(im_mat2, mat2_pos + ivec3(3, 0, 0), 0)));

        texel += sums;

        mat1_pos.x++;
        mat2_pos.y++;
      $elif MAT2_PACKING == "WIDTH_PACKED":
        vec4 mat1_tex = texelFetch(im_mat1, mat1_pos, 0);
        texel = fma(mat1_tex.xxxx, texelFetch(im_mat2, mat2_pos, 0), texel);
        mat2_pos.y++;
        texel = fma(mat1_tex.yyyy, texelFetch(im_mat2, mat2_pos, 0), texel);
        mat2_pos.y++;
        texel = fma(mat1_tex.zzzz, texelFetch(im_mat2, mat2_pos, 0), texel);
        mat2_pos.y++;
        texel = fma(mat1_tex.wwww, texelFetch(im_mat2, mat2_pos, 0), texel);
        mat2_pos.y++;

        mat1_pos.x++;
      $else:
        $raise Exception("Unsupported value for MAT2_PACKING")
    }
  $elif MAT1_PACKING == "CHANNELS_PACKED" and MAT2_PACKING == "CHANNELS_PACKED":
    int K = in_sizes.data[0];
    for (int i = 0; i < K; ++i) {
      texel = fma(
          texelFetch(im_mat1, mat1_pos, 0),
          texelFetch(im_mat2, mat2_pos, 0),
          texel);

      mat1_pos.x++;
      mat2_pos.y++;
    }
  $else:
    $raise Exception("Unsupported value combo for MAT1_PACKING and MAT2_PACKING")

  imageStore(im_out, pos, texel);
}
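To make the W_packed x H_packed variant easier to follow, here is a small CPU sketch (assumed, illustration only) of the same accumulation: a width-packed mat1 texel holds 4 consecutive K-elements of a row, a height-packed mat2 texel holds 4 consecutive K-elements of a column, and each output texel accumulates 4 dot products, one per output column.

import numpy as np

def matmul_w_packed_h_packed(mat1: np.ndarray, mat2: np.ndarray) -> np.ndarray:
    # mat1: (M, K), "width packed"  -> a texel holds 4 consecutive K elements of a row
    # mat2: (K, N), "height packed" -> a texel holds 4 consecutive K elements of a column
    M, K = mat1.shape
    K2, N = mat2.shape
    assert K == K2 and K % 4 == 0 and N % 4 == 0  # padding is ignored in this sketch
    out = np.zeros((M, N), dtype=mat1.dtype)
    for y in range(M):               # pos.y: output row
        for x in range(N // 4):      # pos.x: output texel, covering 4 output columns
            texel = np.zeros(4, dtype=mat1.dtype)
            for i in range(K // 4):  # the K loop from the shader, one texel at a time
                mat1_tex = mat1[y, 4 * i : 4 * i + 4]
                for j in range(4):   # the four dot() calls that build `sums`
                    col = 4 * x + j
                    texel[j] += mat1_tex @ mat2[4 * i : 4 * i + 4, col]
            out[y, 4 * x : 4 * x + 4] = texel
    return out

# Sanity check against a reference matmul (illustration only).
a = np.random.rand(8, 12).astype(np.float32)
b = np.random.rand(12, 16).astype(np.float32)
assert np.allclose(matmul_w_packed_h_packed(a, b), a @ b, atol=1e-5)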
backends/vulkan/runtime/graph/ops/glsl/matmul.yaml

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

matmul:
  parameter_names_with_default_values:
    DTYPE: float
    NDIM: 3
    MAT1_PACKING: WIDTH_PACKED
    MAT2_PACKING: HEIGHT_PACKED
  generate_variant_forall:
    DTYPE:
      - VALUE: float
        SUFFIX: float
      - VALUE: half
        SUFFIX: half
  shader_variants:
    - NAME: matmul_W_packed_H_packed
    - NAME: matmul_W_packed_W_packed
      MAT2_PACKING: WIDTH_PACKED
    - NAME: matmul_C_packed_C_packed
      MAT1_PACKING: CHANNELS_PACKED
      MAT2_PACKING: CHANNELS_PACKED
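For reference, generate_variant_forall is assumed here to expand every shader_variants entry once per DTYPE, appending the SUFFIX to the variant name. A rough sketch of the resulting shader names:

variants = [
    "matmul_W_packed_H_packed",
    "matmul_W_packed_W_packed",
    "matmul_C_packed_C_packed",
]
suffixes = ["float", "half"]
generated = [f"{name}_{suffix}" for name in variants for suffix in suffixes]
# e.g. ["matmul_W_packed_H_packed_float", "matmul_W_packed_H_packed_half", ...]

These are the same names that the C++ op below rebuilds at runtime via apply_memory_layout_suffix and apply_dtype_suffix before calling VK_KERNEL_FROM_STR.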
backends/vulkan/runtime/graph/ops/impl/MatMul.cpp

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>

#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/ScalarUtils.h>
#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>

#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>

namespace at {
namespace native {
namespace vulkan {

void check_matmul_args(
    const vTensor& mat1,
    const vTensor& mat2,
    const vTensor& out) {
  VK_CHECK_COND(check_ndim_is(mat1, 2) || check_ndim_is(mat1, 3));
  VK_CHECK_COND(check_same_ndim(mat1, mat2));

  VK_CHECK_COND(
      check_memory_layout_is(
          mat1, api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED) ||
      check_memory_layout_is(mat1, api::GPUMemoryLayout::TENSOR_WIDTH_PACKED));
  VK_CHECK_COND(check_same_memory_layout(mat1, out));

  VK_CHECK_COND(check_same_sizes_at(mat1, -1, mat2, -2));
}

void resize_matmul_node(
    ComputeGraph* graph,
    const std::vector<ArgGroup>& args,
    const std::vector<ValueRef>& extra_args) {
  (void)extra_args;
  vTensor& out = graph->get_val(args[0].refs[0]).toTensor();
  vTensor& mat1 = graph->get_val(args[1].refs[0]).toTensor();
  vTensor& mat2 = graph->get_val(args[1].refs[1]).toTensor();

  std::vector<int64_t> new_out_sizes(3);
  if (mat1.sizes().size() == 2) {
    new_out_sizes.resize(2);
    new_out_sizes.at(0) = mat1.sizes().at(0);
    new_out_sizes.at(1) = mat2.sizes().at(1);
  } else {
    new_out_sizes.at(0) = mat1.sizes().at(0);
    new_out_sizes.at(1) = mat1.sizes().at(1);
    new_out_sizes.at(2) = mat2.sizes().at(2);
  }

  out.virtual_resize(new_out_sizes);
}

void add_matmul_node(
    ComputeGraph& graph,
    const ValueRef mat1,
    const ValueRef mat2,
    const ValueRef out) {
  ValueRef arg1 = prepack_if_tensor_ref(
      graph, mat1, api::GPUMemoryLayout::TENSOR_WIDTH_PACKED);

  api::GPUMemoryLayout mat2_layout = graph.memory_layout_of(arg1) ==
          api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED
      ? api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED
      : api::GPUMemoryLayout::TENSOR_HEIGHT_PACKED;

  ValueRef arg2 = prepack_if_tensor_ref(graph, mat2, mat2_layout);

  vTensor& t_mat1 = graph.get_val(arg1).toTensor();
  vTensor& t_mat2 = graph.get_val(arg2).toTensor();
  vTensor& t_out = graph.get_val(out).toTensor();

  check_matmul_args(t_mat1, t_mat2, t_out);

  api::utils::uvec3 global_size = t_out.virtual_extents();
  api::utils::uvec3 local_size = adaptive_work_group_size(global_size);

  std::stringstream kernel_name;
  kernel_name << "matmul";
  apply_memory_layout_suffix(kernel_name, t_mat1);
  apply_memory_layout_suffix(kernel_name, t_mat2);
  apply_dtype_suffix(kernel_name, t_out);

  graph.execute_nodes().emplace_back(new ExecuteNode(
      graph,
      VK_KERNEL_FROM_STR(kernel_name.str()),
      global_size,
      local_size,
      // Inputs and Outputs
      {{out, api::MemoryAccessType::WRITE},
       {{arg1, arg2}, api::MemoryAccessType::READ}},
      // Shader params buffers
      {t_out.extents_ubo(), t_mat1.cpu_sizes_ubo()},
      // Resizing
      resize_matmul_node));
}

void matmul(ComputeGraph& graph, const std::vector<ValueRef>& args) {
  return add_matmul_node(graph, args[0], args[1], args[2]);
}

REGISTER_OPERATORS {
  VK_REGISTER_OP(aten.mm.default, matmul);
}

} // namespace vulkan
} // namespace native
} // namespace at
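The resize callback above encodes the usual matmul shape rule. A small sketch (assumed, mirroring resize_matmul_node) of the output sizes it produces for 2-D and batched 3-D inputs:

def matmul_out_sizes(mat1_sizes, mat2_sizes):
    # 2-D: (M, K) @ (K, N) -> (M, N)
    if len(mat1_sizes) == 2:
        return [mat1_sizes[0], mat2_sizes[1]]
    # 3-D (batched): (B, M, K) @ (B, K, N) -> (B, M, N)
    return [mat1_sizes[0], mat1_sizes[1], mat2_sizes[2]]

assert matmul_out_sizes([31, 63], [63, 22]) == [31, 22]
assert matmul_out_sizes([2, 31, 63], [2, 63, 22]) == [2, 31, 22]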

backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.cpp

Lines changed: 22 additions & 1 deletion
@@ -37,8 +37,29 @@ std::vector<int64_t> calculate_broadcasted_output_size(
 // Tensor property checking functions
 //

+bool check_ndim_is(const vTensor& t, size_t ndim) {
+  return t.sizes().size() == ndim;
+}
+
+bool check_same_sizes_at(
+    const vTensor& t1,
+    const int64_t d1,
+    const vTensor& t2,
+    const int64_t d2) {
+  return api::utils::val_at(d1, t1.sizes()) ==
+      api::utils::val_at(d2, t2.sizes());
+}
+
+bool check_memory_layout_is(const vTensor& t, api::GPUMemoryLayout layout) {
+  return t.gpu_memory_layout() == layout;
+}
+
+bool check_same_ndim(const vTensor& t1, const vTensor& t2) {
+  return t1.sizes().size() == t2.sizes().size();
+}
+
 bool check_same_memory_layout(const vTensor& t1, const vTensor& t2) {
-  return (t1.gpu_memory_layout() == t2.gpu_memory_layout());
+  return t1.gpu_memory_layout() == t2.gpu_memory_layout();
 }

 bool check_same_memory_layout(
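check_matmul_args calls check_same_sizes_at(mat1, -1, mat2, -2); the negative dims index from the end (Python-style), so this compares mat1's last dimension against mat2's second-to-last dimension, i.e. the shared inner dimension K. A tiny illustration of that check, assuming val_at's negative-index behavior:

def same_sizes_at(sizes1, d1, sizes2, d2):
    # negative dims index from the end, mirroring the assumed val_at semantics
    return sizes1[d1] == sizes2[d2]

assert same_sizes_at([31, 63], -1, [63, 22], -2)       # K matches
assert not same_sizes_at([31, 63], -1, [64, 22], -2)   # K mismatch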

backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h

Lines changed: 12 additions & 0 deletions
@@ -28,6 +28,18 @@ std::vector<int64_t> calculate_broadcasted_output_size(
 // Tensor property checking functions
 //

+bool check_ndim_is(const vTensor& t, size_t ndim);
+
+bool check_same_ndim(const vTensor& t1, const vTensor& t2);
+
+bool check_same_sizes_at(
+    const vTensor& t1,
+    int64_t d1,
+    const vTensor& t2,
+    int64_t d2);
+
+bool check_memory_layout_is(const vTensor& t, api::GPUMemoryLayout layout);
+
 bool check_same_memory_layout(const vTensor& t1, const vTensor& t2);

 bool check_same_memory_layout(

backends/vulkan/test/test_vulkan_delegate.py

Lines changed: 14 additions & 0 deletions
@@ -351,3 +351,17 @@ def forward(self, x1, x2):
         self.lower_module_and_test_output(
             model, sample_inputs, dynamic_shapes=dynamic_shapes, test_inputs=test_inputs
         )
+
+    def test_vulkan_backend_matmul(self):
+        class MatMulModule(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.weight = torch.ones(size=(63, 22), dtype=torch.float32)
+
+            def forward(self, x):
+                return torch.matmul(x, self.weight)
+
+        module = MatMulModule()
+        sample_inputs = (torch.ones(size=(31, 63), dtype=torch.float32),)
+
+        self.lower_module_and_test_output(module, sample_inputs)
