[ET-VK] Add buffer implementation for matrix multiplication

SS-JIA · web-flow · commit 4afc4fb45c58 · 2024-08-23T12:37:59.000-07:00
Differential Revision: D61666461
Pull Request resolved: #4845
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -368,7 +368,7 @@ utils::uvec3 ComputeGraph::create_local_wg_size(
 }
 
 utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
-  return create_local_wg_size(image_extents_of(idx));
+  return create_local_wg_size(create_global_wg_size(idx)); // local size is now derived from the global work group size computed for `idx` rather than from its image extents
 }
 
 void ComputeGraph::copy_into_staging(
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.h b/backends/vulkan/runtime/graph/ComputeGraph.h
@@ -186,6 +186,21 @@ class ComputeGraph final {
 
   std::vector<int64_t> sizes_of(const ValueRef idx) const;
 
+  /*
+   * Returns the size of the tensor at `idx` along the specified dimension,
+   * converted to `T` with a static_cast. Negative indexing is allowed; `dim`
+   * is forwarded unchanged to utils::val_at together with the value's sizes.
+   *
+   * Handles values that hold either a Tensor or a TensorRef. For any other
+   * value type, VK_THROW is invoked with a message that includes the type.
+   */
+  template <typename T>
+  T size_at(const int64_t dim, const ValueRef idx) const {
+    const Value& val = values_.at(idx);
+    if (val.isTensor()) {
+      return static_cast<T>(utils::val_at(dim, val.toConstTensor().sizes()));
+    } else if (val.isTensorRef()) {
+      return static_cast<T>(utils::val_at(dim, val.toConstTensorRef().sizes)); // `sizes` is a data member here, not a method call as on the Tensor path
+    }
+    VK_THROW("Could not get sizes of value with type ", val.type()); // reached only when `val` is neither a Tensor nor a TensorRef
+  }
+
   vkapi::ScalarType dtype_of(const ValueRef idx) const;
 
   inline utils::uvec3 image_extents_of(const ValueRef idx) const {
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.glsl
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_tensor(0, "w", "t_out", DTYPE, "buffer")}
+${layout_declare_tensor(1, "r", "t_mat1", DTYPE, "buffer")}
+${layout_declare_tensor(2, "r", "t_mat2", DTYPE, "buffer")}
+${layout_declare_ubo(3, "ivec4", "out_sizes")}
+${layout_declare_ubo(4, "ivec4", "out_strides")}
+${layout_declare_ubo(5, "ivec4", "mat1_sizes")}
+${layout_declare_ubo(6, "ivec4", "mat1_strides")}
+${layout_declare_ubo(7, "ivec4", "mat2_sizes")}
+${layout_declare_ubo(8, "ivec4", "mat2_strides")}
+${layout_declare_ubo(9, "int", "out_numel")}
+
+#include "indexing_utils.h"
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+void main() { // each invocation accumulates and writes a single element of t_out
+  const ivec4 out_idx = ivec4(
+      gl_GlobalInvocationID.x,
+      gl_GlobalInvocationID.y,
+      gl_GlobalInvocationID.z % out_sizes.z, // global z carries two output dims at once:
+      gl_GlobalInvocationID.z / out_sizes.z); // z = out_idx.z + out_idx.w * out_sizes.z
+
+  if (any(greaterThanEqual(out_idx, out_sizes))) { // drop invocations outside the output extents; the out_numel UBO is not read in this function
+    return;
+  }
+
+  int mat1_id = to_buffer_id( // start position in t_mat1 for this output element (x component 0); to_buffer_id comes from indexing_utils.h, presumably index-dot-strides - confirm
+      ivec4(0, out_idx.y, out_idx.z, out_idx.w), mat1_strides);
+  int mat2_id = to_buffer_id( // start position in t_mat2 for this output element (y component 0)
+      ivec4(out_idx.x, 0, out_idx.z, out_idx.w), mat2_strides);
+
+  T sum = T(0.0);
+  for (int i = 0; i < mat1_sizes.x; ++i) { // mat1_sizes.x iterations, one product accumulated per iteration
+    sum += t_mat1[mat1_id] * t_mat2[mat2_id];
+
+    mat1_id += mat1_strides.x; // advance t_mat1 by one step along its x dim
+    mat2_id += mat2_strides.y; // advance t_mat2 by one step along its y dim
+  }
+
+  const int out_id = to_buffer_id(out_idx, out_strides);
+  t_out[out_id] = T(sum);
+}
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_buffer.yaml
@@ -0,0 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+matmul_naive_buffer:  # shader config for matmul_naive_buffer.glsl
+  parameter_names_with_default_values:
+    DTYPE: float  # default element type substituted into the template
+    STORAGE: buffer  # NOTE(review): matmul_naive_buffer.glsl passes the literal "buffer" to layout_declare_tensor and does not visibly reference STORAGE - confirm whether the codegen needs this entry
+  generate_variant_forall:
+    DTYPE:  # values listed here are float and half; presumably one shader is generated per value - codegen not shown
+      - VALUE: float
+      - VALUE: half
+  shader_variants:
+    - NAME: matmul_naive_buffer  # base name; the C++ caller appends a dtype suffix via add_dtype_suffix before lookup
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.glsl
@@ -16,17 +16,11 @@ $if MAT2_IS_TRANSPOSED:
 #include "indexing_utils.h"
 #include "matmul.h"
 
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly image3D im_out;
-layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat1;
-layout(set = 0, binding = 2) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} im_mat2;
-
-layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
-  ivec4 in_sizes;
-};
+${layout_declare_tensor(0, "w", "im_out", DTYPE, "texture3d")}
+${layout_declare_tensor(1, "r", "im_mat1", DTYPE, "texture3d")}
+${layout_declare_tensor(2, "r", "im_mat2", DTYPE, "texture3d")}
+${layout_declare_ubo(3, "ivec3", "out_limits")}
+${layout_declare_ubo(4, "ivec4", "in_sizes")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_naive_texture3d.yaml
@@ -4,10 +4,10 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-matmul_naive:
+matmul_naive_texture3d:
   parameter_names_with_default_values:
     DTYPE: float
-    NDIM: 3
+    STORAGE: texture3d
     MAT1_PACKING: W_packed
     MAT2_PACKING: H_packed
     MAT2_IS_TRANSPOSED: false
@@ -16,9 +16,9 @@ matmul_naive:
       - VALUE: float
       - VALUE: half
   shader_variants:
-    - NAME: matmul_naive_W_packed_H_packed
-    - NAME: matmul_naive_W_packed_W_packed
+    - NAME: matmul_naive_texture3d_W_packed_H_packed
+    - NAME: matmul_naive_texture3d_W_packed_W_packed
       MAT2_PACKING: W_packed
-    - NAME: matmul_transposed_naive_W_packed_W_packed
+    - NAME: matmul_transposed_naive_texture3d_W_packed_W_packed
       MAT2_PACKING: W_packed
       MAT2_IS_TRANSPOSED: true
diff --git a/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp b/backends/vulkan/runtime/graph/ops/impl/MatMul.cpp
@@ -62,7 +62,48 @@ void resize_matmul_node(
   out->virtual_resize(new_out_sizes);
 }
 
-void add_matmul_naive_node(
+void add_matmul_naive_buffer_node( // appends an ExecuteNode to `graph` that runs the "matmul_naive_buffer" shader, reading `mat1` and `mat2_data` and writing `out`
+    ComputeGraph& graph,
+    const ValueRef mat1,
+    const ValueRef mat2_data,
+    const ValueRef out,
+    const ValueRef mat2_is_transposed) {
+  ValueRef mat2 = prepack_if_tensor_ref(graph, mat2_data, utils::kHeightPacked); // helper not shown; presumably returns mat2_data unchanged unless it is a TensorRef - confirm
+
+  std::string kernel_name = "matmul_naive_buffer";
+  add_dtype_suffix(kernel_name, graph.dtype_of(out)); // the dtype of `out` is the only thing that varies the kernel name
+
+  utils::uvec3 global_size = { // x = size at dim -1, y = size at dim -2, z = product of sizes at dims -3 and -4
+      graph.size_at<uint32_t>(-1, out),
+      graph.size_at<uint32_t>(-2, out),
+      graph.size_at<uint32_t>(-3, out) * graph.size_at<uint32_t>(-4, out)}; // the shader splits z back apart with % and / by out_sizes.z; NOTE(review): relies on utils::val_at yielding 1 for dims `out` does not have - confirm
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      VK_KERNEL_FROM_STR(kernel_name),
+      global_size,
+      graph.create_local_wg_size(global_size),
+      // Inputs and Outputs
+      {{out, vkapi::MemoryAccessType::WRITE},
+       {{mat1, mat2}, vkapi::MemoryAccessType::READ}},
+      // Shader params buffers
+      { // listed in the same order as UBO bindings 3-9 declared in matmul_naive_buffer.glsl
+          graph.sizes_ubo(out),
+          graph.strides_ubo(out),
+          graph.sizes_ubo(mat1),
+          graph.strides_ubo(mat1),
+          graph.sizes_ubo(mat2),
+          graph.strides_ubo(mat2),
+          graph.numel_ubo(out),
+      },
+      // Specialization Constants
+      {},
+      // Resizing Logic
+      resize_matmul_node,
+      {mat2_is_transposed})); // NOTE(review): mat2_is_transposed is forwarded only as a resize argument; neither the kernel name nor the shader inputs depend on it - confirm a transposed mat2 cannot reach the buffer path
+}
+
+void add_matmul_naive_texture3d_node(
     ComputeGraph& graph,
     const ValueRef mat1,
     const ValueRef mat2_data,
@@ -74,6 +115,7 @@ void add_matmul_naive_node(
       ? "matmul_transposed_naive"
       : "matmul_naive";
   kernel_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(kernel_name, graph.storage_type_of(out));
   add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat1));
   add_memory_layout_suffix(kernel_name, graph.memory_layout_of(mat2));
   add_dtype_suffix(kernel_name, graph.dtype_of(out));
@@ -174,12 +216,16 @@ void add_matmul_node(
     const ValueRef mat2_data,
     const ValueRef out,
     const ValueRef mat2_is_transposed) {
-  if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) {
+  if (graph.is_buffer_storage(out)) {
+    add_matmul_naive_buffer_node(
+        graph, mat1, mat2_data, out, mat2_is_transposed);
+  } else if (graph.memory_layout_of(mat1) == utils::kChannelsPacked) {
     add_matmul_optimized_node(graph, mat1, mat2_data, out, mat2_is_transposed);
   } else if (graph.memory_layout_of(mat1) == utils::kWidthPacked) {
-    add_matmul_naive_node(graph, mat1, mat2_data, out, mat2_is_transposed);
+    add_matmul_naive_texture3d_node(
+        graph, mat1, mat2_data, out, mat2_is_transposed);
   } else {
-    VK_THROW("Input should be channel packed or width packed.");
+    VK_THROW("Input texture should be channel packed or width packed.");
   }
 }
 
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
@@ -70,6 +70,7 @@ def get_mm_inputs():
     test_suite.prepacked_args = ["mat2"]
     # ATen matmul doesn't support half
     test_suite.dtypes = ["at::kFloat"]
+    test_suite.storage_types = ["utils::kTexture3D", "utils::kBuffer"]
     test_suite.layouts = [
         "utils::kWidthPacked",
         "utils::kChannelsPacked",
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -2282,24 +2282,28 @@ void test_binary_op(
   }
 }
 
-#define CALL_TEST_FN_FORALL_CONDITIONS(_)                                 \
-  _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, false)    \
-  _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, false)   \
-  _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, false) \
-  _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, true)     \
-  _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_HEIGHT_PACKED, true)    \
-  _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, true)
-
-#define CALL_TEST_FN_FOR_W_PACKED(_)                                   \
-  _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, false) \
-  _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_WIDTH_PACKED, true)
-
-#define CALL_TEST_FN_FOR_C_PACKED(_)                                      \
-  _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, false) \
-  _(vkapi::kFloat, utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED, true)
+#define CALL_TEST_FN_FORALL_CONDITIONS(_) /* applies `_` to each row (dtype, storage type, memory layout, prepack); texture3d rows only */ \
+  _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, false)    \
+  _(vkapi::kFloat, utils::kTexture3D, utils::kHeightPacked, false)   \
+  _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, false) \
+  _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, true)     \
+  _(vkapi::kFloat, utils::kTexture3D, utils::kHeightPacked, true)    \
+  _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, true)
+
+#define CALL_TEST_FN_FOR_W_PACKED(_) /* width-packed rows for both texture3d and buffer storage */ \
+  _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, false) \
+  _(vkapi::kFloat, utils::kTexture3D, utils::kWidthPacked, true)  \
+  _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, false)    \
+  _(vkapi::kFloat, utils::kBuffer, utils::kWidthPacked, true)
+
+#define CALL_TEST_FN_FOR_C_PACKED(_) /* channels-packed rows for both texture3d and buffer storage */ \
+  _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, false) \
+  _(vkapi::kFloat, utils::kTexture3D, utils::kChannelsPacked, true)  \
+  _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, false)    \
+  _(vkapi::kFloat, utils::kBuffer, utils::kChannelsPacked, true)
 
 TEST(VulkanComputeGraphOpsTest, add_smoke_test) {
-#define RUN_TESTS(dtype, layout, prepack)                                  \
+#define RUN_TESTS(dtype, storage, layout, prepack)                         \
   test_binary_op("add", {17, 21}, {17, 21}, dtype, layout, prepack);       \
   test_binary_op("add", {17, 21}, {1, 1}, dtype, layout, prepack);         \
   test_binary_op("sub", {11, 22}, {11, 22}, dtype, layout, prepack);       \
@@ -2320,9 +2324,11 @@ void test_mm(
     int K,
     int N,
     vkapi::ScalarType dtype,
+    utils::StorageType storage_type,
     utils::GPUMemoryLayout memory_layout,
     bool prepack = true) {
   GraphConfig config;
+  config.set_storage_type_override(storage_type);
   ComputeGraph graph(config);
 
   std::vector<int64_t> mat1_size = {M, K};
@@ -2379,38 +2385,42 @@ void test_mm(
 }
 
 TEST(VulkanComputeGraphOpsTest, mm_smoke_test) {
-#define RUN_TESTS(dtype, layout, prepack) \
-  test_mm(                                \
-      /*B = */ 1,                         \
-      /*M = */ 31,                        \
-      /*K = */ 127,                       \
-      /*N = */ 23,                        \
-      dtype,                              \
-      layout,                             \
-      prepack);                           \
-  test_mm(                                \
-      /*B = */ 5,                         \
-      /*M = */ 31,                        \
-      /*K = */ 127,                       \
-      /*N = */ 23,                        \
-      dtype,                              \
-      layout,                             \
-      prepack);                           \
-  test_mm(                                \
-      /*B = */ 7,                         \
-      /*M = */ 13,                        \
-      /*K = */ 89,                        \
-      /*N = */ 17,                        \
-      dtype,                              \
-      layout,                             \
-      prepack);                           \
-  test_mm(                                \
-      /*B = */ 1,                         \
-      /*M = */ 13,                        \
-      /*K = */ 89,                        \
-      /*N = */ 17,                        \
-      dtype,                              \
-      layout,                             \
+#define RUN_TESTS(dtype, storage_type, layout, prepack) \
+  test_mm(                                              \
+      /*B = */ 1,                                       \
+      /*M = */ 31,                                      \
+      /*K = */ 127,                                     \
+      /*N = */ 23,                                      \
+      dtype,                                            \
+      storage_type,                                     \
+      layout,                                           \
+      prepack);                                         \
+  test_mm(                                              \
+      /*B = */ 5,                                       \
+      /*M = */ 31,                                      \
+      /*K = */ 127,                                     \
+      /*N = */ 23,                                      \
+      dtype,                                            \
+      storage_type,                                     \
+      layout,                                           \
+      prepack);                                         \
+  test_mm(                                              \
+      /*B = */ 7,                                       \
+      /*M = */ 13,                                      \
+      /*K = */ 89,                                      \
+      /*N = */ 17,                                      \
+      dtype,                                            \
+      storage_type,                                     \
+      layout,                                           \
+      prepack);                                         \
+  test_mm(                                              \
+      /*B = */ 1,                                       \
+      /*M = */ 13,                                      \
+      /*K = */ 89,                                      \
+      /*N = */ 17,                                      \
+      dtype,                                            \
+      storage_type,                                     \
+      layout,                                           \
       prepack);
 
   CALL_TEST_FN_FOR_W_PACKED(RUN_TESTS);

Original file line number	Diff line number	Diff line change
`@@ -368,7 +368,7 @@ utils::uvec3 ComputeGraph::create_local_wg_size(`
`368`	`368`	`}`
`369`	`369`
`370`	`370`	`utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {`
`371`		`- return create_local_wg_size(image_extents_of(idx));`
	`371`	`+ return create_local_wg_size(create_global_wg_size(idx));`
`372`	`372`	`}`
`373`	`373`
`374`	`374`	`void ComputeGraph::copy_into_staging(`