[ET-VK] Introduce virtual_transpose() to vTensor for no copy transposition

SS-JIA · SS-JIA · commit 15caddcb1a0f · 2024-09-13T11:17:03.000-07:00
## Context

With `axis_map` integrated into matrix multiplication, we can now test no-copy transposes for texture-backed tensors. Transposing a tensor can be done without modifying the storage by swapping elements in the tensor's `axis_map`, and also updating the layout of the tensor if the packed dimension was one of the dims that were transposed.

Differential Revision: [D62652009](https://our.internmc.facebook.com/intern/diff/D62652009/)
ghstack-source-id: 242452080
Pull Request resolved: #5353
diff --git a/backends/vulkan/runtime/api/containers/Tensor.cpp b/backends/vulkan/runtime/api/containers/Tensor.cpp
@@ -567,6 +567,48 @@ void vTensor::virtual_resize(const std::vector<int64_t>& new_sizes) {
   update_metadata();
 }
 
+/*
+ * Relabels dim0 and dim1 inside a dim order vector, in place.
+ *
+ * Transposing the dim order is a bit unintuitive. dim0 and dim1 have swapped
+ * their "identities", so every entry whose value is dim0 becomes dim1 and
+ * vice versa, while each entry keeps its position. Compare this to a `sizes`
+ * vector, where the elements stored at positions dim0 and dim1 are swapped.
+ *
+ * dim_order: vector to update in place; entries equal to neither dim are
+ *            left untouched.
+ * dim0, dim1: the two dim values being exchanged. Passing the same value for
+ *             both leaves dim_order unchanged.
+ */
+void transpose_dim_order_inplace(
+    std::vector<int64_t>& dim_order,
+    const int64_t dim0,
+    const int64_t dim1) {
+  // Iterate by reference instead of by index so that no signed loop counter
+  // is compared against the unsigned result of size().
+  for (int64_t& dim : dim_order) {
+    if (dim == dim0) {
+      dim = dim1;
+    } else if (dim == dim1) {
+      dim = dim0;
+    }
+  }
+}
+
+/*
+ * Transposes dim0 and dim1 by rewriting tensor metadata only.
+ *
+ * dim0 and dim1 are indices into sizes_. The sizes at those positions are
+ * always swapped. For buffer storage the dim order is relabeled as well. For
+ * texture storage the matching axis_map_ entries are swapped and, when the
+ * packed dim is one of the transposed dims, memory_layout_ is moved to the
+ * other one.
+ *
+ * Every precondition is checked before any member is modified, so a failed
+ * check does not leave the tensor with partially updated metadata.
+ */
+void vTensor::virtual_transpose(const int64_t dim0, const int64_t dim1) {
+  const int64_t ndim = static_cast<int64_t>(sizes_.size());
+  // Both dims are used as offsets from sizes_.begin() below.
+  VK_CHECK_COND(dim0 >= 0 && dim0 < ndim && dim1 >= 0 && dim1 < ndim);
+
+  const bool is_buffer = storage_type() == utils::kBuffer;
+  // Same dims counted from the innermost dimension (WHCN indexing).
+  const int dim0_whcn = static_cast<int>(ndim - 1 - dim0);
+  const int dim1_whcn = static_cast<int>(ndim - 1 - dim1);
+  if (!is_buffer) {
+    // Cannot transpose batch dimension for texture storage
+    VK_CHECK_COND(dim0_whcn < 3 && dim1_whcn < 3);
+  }
+
+  std::iter_swap(sizes_.begin() + dim0, sizes_.begin() + dim1);
+  if (is_buffer) {
+    transpose_dim_order_inplace(dim_order_, dim0, dim1);
+  } else {
+    std::iter_swap(
+        axis_map_.begin() + dim0_whcn, axis_map_.begin() + dim1_whcn);
+
+    // Read the packed dim once, before memory_layout_ is reassigned.
+    // NOTE(review): packed_dim_whcn_idx() presumably derives from
+    // memory_layout_ - confirm. If so, re-reading it after the first
+    // assignment would match the second condition and undo the swap whenever
+    // the packed dim is dim0.
+    const auto packed_dim = packed_dim_whcn_idx();
+    if (packed_dim == dim0_whcn) {
+      memory_layout_ = utils::GPUMemoryLayout(dim1_whcn);
+    } else if (packed_dim == dim1_whcn) {
+      memory_layout_ = utils::GPUMemoryLayout(dim0_whcn);
+    }
+  }
+  update_metadata();
+}
+
 void vTensor::reallocate(const std::vector<int64_t>& new_sizes) {
   sizes_ = new_sizes;
   update_metadata();
diff --git a/backends/vulkan/runtime/api/containers/Tensor.h b/backends/vulkan/runtime/api/containers/Tensor.h
@@ -530,6 +530,11 @@ class vTensor final {
    */
   void virtual_resize(const std::vector<int64_t>& new_sizes);
 
+  /*
+   * Transpose the tensor in-place by updating its metadata. dim0 and dim1
+   * are indices into the tensor's sizes, which are always swapped. Buffer
+   * storage additionally updates the dim order. Texture storage swaps the
+   * corresponding axis map entries and changes the memory layout when the
+   * packed dim is one of the transposed dims. For texture storage, both dims
+   * must have a WHCN index below 3, i.e. the batch dim cannot be transposed.
+   */
+  void virtual_transpose(const int64_t dim0, const int64_t dim1);
+
   /*
    * Discard the underlying VkImage or VkBuffer and re-allocate based on new
    * tensor sizes
diff --git a/backends/vulkan/test/utils/test_utils.cpp b/backends/vulkan/test/utils/test_utils.cpp
@@ -314,6 +314,44 @@ void record_reference_matmul(
       mat2.strides_ubo());
 }
 
+/*
+ * Submits a compute job that runs the "matmul_naive" shader variant selected
+ * by out's storage type and dtype.
+ *
+ * context: context the job is submitted to.
+ * out: destination tensor; its image is bound with write access and its
+ *      logical extents are used as the global work group size.
+ * mat1, mat2: operand tensors whose images are bound after out's.
+ *
+ * The sizes and axis map UBOs of all three tensors, plus out's logical
+ * limits UBO, are passed to the shader.
+ */
+void record_matmul_texture3d(
+    api::Context* context,
+    api::vTensor& out,
+    api::vTensor& mat1,
+    api::vTensor& mat2) {
+  std::string kernel_name = "matmul_naive";
+  kernel_name.reserve(kShaderNameReserve);
+  add_storage_type_suffix(kernel_name, out.storage_type());
+  add_dtype_suffix(kernel_name, out.dtype());
+
+  utils::uvec3 global_wg_size = out.logical_extents();
+
+  vkapi::PipelineBarrier pipeline_barrier{};
+  // Submit through the caller-supplied context rather than the global one so
+  // that the `context` argument is actually honored.
+  context->submit_compute_job(
+      VK_KERNEL_FROM_STR(kernel_name),
+      pipeline_barrier,
+      global_wg_size,
+      {8, 8, 1},
+      // Packed dim of each tensor, in the order out, mat1, mat2.
+      {out.packed_dim_whcn_idx(),
+       mat1.packed_dim_whcn_idx(),
+       mat2.packed_dim_whcn_idx()},
+      VK_NULL_HANDLE,
+      0,
+      out.image(
+          pipeline_barrier,
+          vkapi::PipelineStage::COMPUTE,
+          vkapi::MemoryAccessType::WRITE),
+      mat1.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
+      mat2.image(pipeline_barrier, vkapi::PipelineStage::COMPUTE),
+      out.sizes_ubo(),
+      out.logical_limits_ubo(),
+      out.axis_map_ubo(),
+      mat1.sizes_ubo(),
+      mat1.axis_map_ubo(),
+      mat2.sizes_ubo(),
+      mat2.axis_map_ubo());
+}
+
 //
 // Input & Output Utilities
 //
diff --git a/backends/vulkan/test/utils/test_utils.h b/backends/vulkan/test/utils/test_utils.h
@@ -121,6 +121,12 @@ void record_reference_matmul(
     api::vTensor& mat1,
     api::vTensor& mat2);
 
+/*
+ * Submits a compute job that runs the "matmul_naive" shader variant matching
+ * out's storage type and dtype. out's image is bound with write access,
+ * followed by the images of mat1 and mat2, together with the sizes and axis
+ * map UBOs of all three tensors.
+ */
+void record_matmul_texture3d(
+    api::Context* context,
+    api::vTensor& out,
+    api::vTensor& mat1,
+    api::vTensor& mat2);
+
 //
 // Input & Output Utilities
 //
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -258,6 +258,48 @@ TEST_F(VulkanComputeAPITest, calculate_tensor_strides_test) {
   }
 }
 
+TEST_F(VulkanComputeAPITest, virtual_transpose_test) {
+  // Metadata expected after transposing (dim0, dim1) of a freshly created
+  // width-packed tensor with the sizes declared below.
+  struct TransposeCase {
+    int dim0;
+    int dim1;
+    std::vector<int64_t> new_sizes;
+    std::vector<int64_t> new_dim_order;
+    std::vector<int64_t> new_axis_map;
+    int new_packed_dim_idx;
+  };
+
+  std::vector<int64_t> sizes = {7, 9, 11, 13};
+  const std::vector<TransposeCase> cases = {
+      {2, 3, {7, 9, 13, 11}, {0, 1, 3, 2}, {1, 0, 2, 2}, 1},
+      {2, 1, {7, 11, 9, 13}, {0, 2, 1, 3}, {0, 2, 1, 2}, 0},
+      {1, 3, {7, 13, 11, 9}, {0, 3, 2, 1}, {2, 1, 0, 2}, 2},
+  };
+
+  for (const TransposeCase& expected : cases) {
+    // Buffer-backed tensor: sizes and dim order are checked.
+    {
+      vTensor buffer_tensor(
+          context(), sizes, vkapi::kFloat, utils::kBuffer, utils::kWidthPacked);
+      buffer_tensor.virtual_transpose(expected.dim0, expected.dim1);
+
+      EXPECT_TRUE(buffer_tensor.sizes() == expected.new_sizes);
+      EXPECT_TRUE(buffer_tensor.dim_order() == expected.new_dim_order);
+    }
+
+    // Texture-backed tensor: sizes, axis map and packed dim are checked.
+    {
+      vTensor texture_tensor(
+          context(),
+          sizes,
+          vkapi::kFloat,
+          utils::kTexture3D,
+          utils::kWidthPacked);
+      texture_tensor.virtual_transpose(expected.dim0, expected.dim1);
+
+      EXPECT_TRUE(texture_tensor.sizes() == expected.new_sizes);
+      EXPECT_TRUE(texture_tensor.axis_map() == expected.new_axis_map);
+      EXPECT_TRUE(
+          texture_tensor.packed_dim_whcn_idx() == expected.new_packed_dim_idx);
+    }
+  }
+}
+
 TEST_F(VulkanComputeAPITest, vec_test) {
   utils::vec3 v3({1, 2, 3});
   ASSERT_TRUE(v3[0] == 1);
@@ -637,46 +679,60 @@ TEST_F(VulkanComputeAPITest, tensor_no_copy_transpose_test) {
   constexpr int N = 17;
   std::vector<int64_t> mat1_sizes = {M, K};
   std::vector<int64_t> mat2_sizes = {N, K};
-  std::vector<int64_t> mat2_t_sizes = {K, N};
   std::vector<int64_t> out_sizes = {M, N};
 
-  std::vector<int64_t> transposed_dim_order = {1, 0};
-
-  vTensor mat1 = CREATE_FLOAT_BUFFER(mat1_sizes, /*allocate_memory=*/true);
-  vTensor mat2 = CREATE_FLOAT_BUFFER(mat2_sizes, /*allocate_memory=*/true);
-  vTensor out = CREATE_FLOAT_BUFFER(out_sizes, /*allocate_memory=*/true);
-
-  // Generate data
-  std::vector<float> mat1_data =
-      create_random_float_buffer(mat1.staging_buffer_numel());
-  std::vector<float> mat2_data =
-      create_random_float_buffer(mat2.staging_buffer_numel());
-
-  // Create direct view and modify sizes and strides later
-  vTensor mat2_t = vTensor(mat2);
-
-  std::vector<float> mat2_t_data = transpose_matrix(mat2_data, N, K);
-  std::vector<float> ref_out =
-      compute_reference_matmul(mat1_data, mat2_t_data, M, K, N);
-
-  // Fill original tensor with some data
-  fill_vtensor(mat1, mat1_data);
-  fill_vtensor(mat2, mat2_data);
+  for (const auto storage_type : {utils::kTexture3D, utils::kBuffer}) {
+    vTensor mat1 = vTensor(
+        context(),
+        mat1_sizes,
+        vkapi::kFloat,
+        storage_type,
+        utils::kWidthPacked);
+    vTensor mat2 = vTensor(
+        context(),
+        mat2_sizes,
+        vkapi::kFloat,
+        storage_type,
+        utils::kWidthPacked);
+    vTensor out = vTensor(
+        context(), out_sizes, vkapi::kFloat, storage_type, utils::kWidthPacked);
+
+    // Generate data
+    std::vector<float> mat1_data =
+        create_random_float_buffer(mat1.staging_buffer_numel());
+    std::vector<float> mat2_data =
+        create_random_float_buffer(mat2.staging_buffer_numel());
+
+    // Create direct view and modify sizes and strides later
+    vTensor mat2_t = vTensor(mat2);
+    // Update sizes and strides of mat2_t to be that of a transposed tensor
+    mat2_t.virtual_transpose(0, 1);
+
+    EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked);
+
+    std::vector<float> mat2_t_data = transpose_matrix(mat2_data, N, K);
+    std::vector<float> ref_out =
+        compute_reference_matmul(mat1_data, mat2_t_data, M, K, N);
 
-  record_reference_matmul(api::context(), out, mat1, mat2_t);
+    // Fill original tensor with some data
+    fill_vtensor(mat1, mat1_data);
+    fill_vtensor(mat2, mat2_data);
 
-  // Update sizes and strides of mat2_t to be that of a transposed tensor
-  mat2_t.virtual_reconfigure(mat2_t_sizes, transposed_dim_order);
-  EXPECT_TRUE(mat2_t.gpu_memory_layout() == utils::kHeightPacked);
+    if (storage_type == utils::kTexture3D) {
+      record_matmul_texture3d(context(), out, mat1, mat2_t);
+    } else {
+      record_reference_matmul(context(), out, mat1, mat2_t);
+    }
 
-  std::vector<float> data_out(out.staging_buffer_numel());
-  // Extract the copy tensor; should contain the data of the original tensor
-  extract_vtensor(out, data_out);
+    std::vector<float> data_out(out.staging_buffer_numel());
+    // Extract the copy tensor; should contain the data of the original tensor
+    extract_vtensor(out, data_out);
 
-  EXPECT_TRUE(data_out.size() == ref_out.size());
+    // EXPECT_TRUE(data_out.size() == ref_out.size());
 
-  for (size_t i = 0; i < data_out.size(); ++i) {
-    EXPECT_TRUE(check_close(data_out[i], ref_out[i]));
+    for (size_t i = 0; i < ref_out.size(); ++i) {
+      EXPECT_TRUE(check_close(data_out[i], ref_out[i]));
+    }
   }
 }