@@ -3206,3 +3206,104 @@ TEST(VulkanComputeGraphOpsTest, test_transpose_with_mm) {
3206
3206
test_transpose_view_mm (2 , 7 , 17 , 5 , storage_type);
3207
3207
}
3208
3208
}
3209
+
3210
+ void test_to_copy () {
3211
+ GraphConfig config;
3212
+ config.set_storage_type_override (utils::kTexture3D );
3213
+ ComputeGraph graph (config);
3214
+ int M = 8 ;
3215
+ int N = 8 ;
3216
+ int K = 8 ;
3217
+ // Build graph
3218
+ IOValueRef in = graph.add_input_tensor (
3219
+ {1 , M, N, K},
3220
+ vkapi::kFloat ,
3221
+ utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
3222
+
3223
+ std::vector<float > data_in =
3224
+ create_random_float_buffer (M * N * K, -1024 , 1024 );
3225
+ graph.copy_into_staging (in.staging , data_in.data (), data_in.size ());
3226
+
3227
+ IOValueRef out;
3228
+ out.value = graph.add_tensor (
3229
+ {1 , M, N, K},
3230
+ vkapi::kHalf ,
3231
+ utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
3232
+
3233
+ auto op = VK_GET_OP_FN (" aten._to_copy.default" );
3234
+ op (graph,
3235
+ {in.value ,
3236
+ graph.add_none (),
3237
+ graph.add_none (),
3238
+ graph.add_none (),
3239
+ graph.add_none (),
3240
+ graph.add_none (),
3241
+ graph.add_none (),
3242
+ out.value });
3243
+
3244
+ out.staging = graph.set_output_tensor (out.value );
3245
+
3246
+ graph.prepare ();
3247
+ graph.encode_prepack ();
3248
+ graph.prepack ();
3249
+ graph.encode_execute ();
3250
+ graph.propagate_resize ();
3251
+ graph.execute ();
3252
+
3253
+ std::vector<torch::executor::Half> output_data (graph.numel_of (out.value ));
3254
+ graph.copy_from_staging (out.staging , output_data.data (), output_data.size ());
3255
+
3256
+ EXPECT_EQ (data_in.size (), output_data.size ());
3257
+
3258
+ float mse_ex = 0 .0f ;
3259
+ float mse_vk = 0 .0f ;
3260
+
3261
+ // check results
3262
+ for (size_t i = 0 ; i < output_data.size (); ++i) {
3263
+ float input = data_in[i];
3264
+ torch::executor::Half expected_output =
3265
+ static_cast <torch::executor::Half>(input);
3266
+ uint16_t * expected_bits = reinterpret_cast <uint16_t *>(&expected_output);
3267
+ torch::executor::Half output = output_data[i];
3268
+ uint16_t * output_bits = reinterpret_cast <uint16_t *>(&output);
3269
+
3270
+ std::cout << " input = " << input << " (0b"
3271
+ << std::bitset<32 >(*reinterpret_cast <uint32_t *>(&input))
3272
+ << " ), expected output = " << expected_output << " (0b"
3273
+ << std::bitset<16 >(*expected_bits)
3274
+ << " ), recieved output = " << output << " (0b"
3275
+ << std::bitset<16 >(*output_bits) << " )" << std::endl;
3276
+
3277
+ // Note: Torch executor half "rounds up" when converting to fp16 whereas
3278
+ // most driver implementations of Vulkan's opFConvert() just truncates the
3279
+ // extra bits for performance (rounding introduces conditional).
3280
+ // Example:
3281
+ // INPUT F32 = 25.248 (sign{0b0}, exp{0b10000011},
3282
+ // mantissa{0b10010011111101111100111}),
3283
+ // TORCH HALF OUTPUT F16 = 25.25 (sign{0b0}, exp{0b10011},
3284
+ // mantissa{0b1001010000}),
3285
+ // VULKAN OUTPUT F16 = 25.2344 (sign{0b0}, exp{0b10011},
3286
+ // mantissa{0b1001001111})
3287
+ // Note:
3288
+ // The vulkan mantissa exactly matches the first 10
3289
+ // bits of the input 23 bit mantissa. But since the 11th bit is 1, the
3290
+ // torch half output is rounded up (essentially adding a 1).
3291
+ // Vulkan mantissa{0b1001001111} + 1 = Torch half mantissa{0b1001010000}
3292
+
3293
+ EXPECT_TRUE (
3294
+ (*output_bits == *expected_bits) ||
3295
+ /* rounding error*/ ((*output_bits + 1u ) == *expected_bits));
3296
+ mse_ex += std::pow (expected_output - input, 2 );
3297
+ mse_vk += std::pow (output - input, 2 );
3298
+ }
3299
+
3300
+ mse_ex /= output_data.size ();
3301
+ mse_vk /= output_data.size ();
3302
+ std::cout << " ========================================================="
3303
+ << std::endl;
3304
+ std::cout << " mse_ex = " << mse_ex << " , mse_vk = " << mse_vk << std::endl;
3305
+ }
3306
// Registers the fp32 -> fp16 _to_copy conversion check with the gtest
// harness; all test logic lives in test_to_copy() above.
TEST(VulkanComputeGraphOpsTest, test_to_copy) {
  test_to_copy();
}
0 commit comments