
Commit adcc2f7

Abhi-hpp authored and facebook-github-bot committed
Aten _To_Copy (#6055)
Summary: Pull Request resolved: #6055. Implement aten._to_copy. Currently we are only interested in fp32 <-> fp16 conversions, but it should theoretically support other dtype conversions too. I noticed an issue with int conversions, so it is limited to just fp32 and fp16 for now. Note: most driver implementations of the fp16 cast do not "round up" the result, so there may be a 1-bit difference between the Vulkan output and CPU torch.to. This is explained in greater detail in the comments. Reviewed By: SS-JIA Differential Revision: D64080303
1 parent b7e4e95 commit adcc2f7
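To make the rounding note concrete, here is a minimal standalone sketch (not part of this commit; helper names are illustrative) contrasting a truncating fp32 -> fp16 conversion, which is what many Vulkan drivers do in OpFConvert, with a round-to-nearest conversion like the CPU cast. It assumes a normal, in-range input and ignores subnormals, infinities, and NaNs.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Truncating conversion: drop the low 13 mantissa bits, keeping the top 10.
// Assumes |f| is a normal value within fp16 range (sketch, not production code).
uint16_t f32_to_f16_truncate(float f) {
  uint32_t b;
  std::memcpy(&b, &f, sizeof(b));
  uint16_t sign = static_cast<uint16_t>((b >> 16) & 0x8000u);
  uint16_t exp = static_cast<uint16_t>(((b >> 23) & 0xFFu) - 127 + 15); // rebias 8-bit -> 5-bit
  uint16_t mant = static_cast<uint16_t>((b >> 13) & 0x3FFu);            // top 10 mantissa bits
  return static_cast<uint16_t>(sign | (exp << 10) | mant);
}

// Round-to-nearest conversion: if the highest dropped bit is set, round up.
// (Real round-to-nearest-even also handles ties; omitted for brevity.)
uint16_t f32_to_f16_round(float f) {
  uint16_t h = f32_to_f16_truncate(f);
  uint32_t b;
  std::memcpy(&b, &f, sizeof(b));
  if (b & 0x1000u) {
    ++h; // carries into the exponent if the mantissa overflows, as desired
  }
  return h;
}

int main() {
  float input = 25.248f; // the worked example from the test comments below
  std::printf("truncated: 0x%04x  rounded: 0x%04x\n",
              static_cast<unsigned>(f32_to_f16_truncate(input)),
              static_cast<unsigned>(f32_to_f16_round(input)));
  // Prints 0x4e4f vs 0x4e50: the single-bit difference described above.
  return 0;
}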


3 files changed: +149 −0 lines


backends/vulkan/partitioner/supported_ops.py

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ def __contains__(self, op):
     exir_ops.edge.aten.sin.default,
     exir_ops.edge.aten.sqrt.default,
     exir_ops.edge.aten.tanh.default,
+    exir_ops.edge.aten._to_copy.default,
     # Matrix Multiplication
     exir_ops.edge.aten.bmm.default,
     exir_ops.edge.aten.mm.default,

Lines changed: 47 additions & 0 deletions (new file)
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/BlitNode.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/Staging.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+#include <set>
+
+namespace vkcompute {
+
+void resize_to_copy_op_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  (void)extra_args;
+  vTensorPtr out = graph->get_tensor(args[0].refs[0]);
+  vTensorPtr self = graph->get_tensor(args[1].refs[0]);
+
+  out->virtual_resize(self->sizes());
+}
+
+void add_to_copy_node(ComputeGraph& graph, ValueRef in, ValueRef out) {
+  static std::set<vkapi::ScalarType> supported_types = {
+      vkapi::ScalarType::Float, vkapi::ScalarType::Half};
+
+  VK_CHECK_COND(
+      supported_types.find(graph.dtype_of(in)) != supported_types.end() &&
+          supported_types.find(graph.dtype_of(out)) != supported_types.end(),
+      "Unsupported dtype for to_copy, only Float and Half are currently supported");
+
+  graph.execute_nodes().emplace_back(new BlitNode(graph, prepack_if_tensor_ref(graph, in), out));
+}
+
+void to_copy(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  return add_to_copy_node(graph, args[0], args[7]);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten._to_copy.default, to_copy);
+}
+} // namespace vkcompute
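For orientation, to_copy() reads args[0] and args[7] because the lowering passes operands in the order of the aten._to_copy schema (self plus six optional keyword arguments: dtype, layout, device, pin_memory, non_blocking, memory_format) and appends the output value at the end. A small sketch of that layout follows; the enum names are illustrative, not part of the ExecuTorch API.

#include <cstdio>

// Illustrative arg-index map for to_copy() above. The ordering follows the
// aten._to_copy schema, with the output appended by the export pipeline.
enum ToCopyArg {
  kSelf = 0,          // input tensor
  kDtype = 1,         // ScalarType? (passed as None in the test below)
  kLayout = 2,        // Layout?
  kDevice = 3,        // Device?
  kPinMemory = 4,     // bool?
  kNonBlocking = 5,   // bool
  kMemoryFormat = 6,  // MemoryFormat?
  kOut = 7,           // output value appended after the schema args
};

int main() {
  // to_copy() forwards args[kSelf] and args[kOut] to add_to_copy_node().
  std::printf("self index = %d, out index = %d\n", kSelf, kOut);
  return 0;
}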

backends/vulkan/test/vulkan_compute_api_test.cpp

Lines changed: 101 additions & 0 deletions
@@ -3206,3 +3206,104 @@ TEST(VulkanComputeGraphOpsTest, test_transpose_with_mm) {
     test_transpose_view_mm(2, 7, 17, 5, storage_type);
   }
 }
+
+void test_to_copy() {
+  GraphConfig config;
+  config.set_storage_type_override(utils::kTexture3D);
+  ComputeGraph graph(config);
+  int M = 8;
+  int N = 8;
+  int K = 8;
+  // Build graph
+  IOValueRef in = graph.add_input_tensor(
+      {1, M, N, K},
+      vkapi::kFloat,
+      utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
+
+  std::vector<float> data_in =
+      create_random_float_buffer(M * N * K, -1024, 1024);
+  graph.copy_into_staging(in.staging, data_in.data(), data_in.size());
+
+  IOValueRef out;
+  out.value = graph.add_tensor(
+      {1, M, N, K},
+      vkapi::kHalf,
+      utils::GPUMemoryLayout::TENSOR_CHANNELS_PACKED);
+
+  auto op = VK_GET_OP_FN("aten._to_copy.default");
+  op(graph,
+     {in.value,
+      graph.add_none(),
+      graph.add_none(),
+      graph.add_none(),
+      graph.add_none(),
+      graph.add_none(),
+      graph.add_none(),
+      out.value});
+
+  out.staging = graph.set_output_tensor(out.value);
+
+  graph.prepare();
+  graph.encode_prepack();
+  graph.prepack();
+  graph.encode_execute();
+  graph.propagate_resize();
+  graph.execute();
+
+  std::vector<torch::executor::Half> output_data(graph.numel_of(out.value));
+  graph.copy_from_staging(out.staging, output_data.data(), output_data.size());
+
+  EXPECT_EQ(data_in.size(), output_data.size());
+
+  float mse_ex = 0.0f;
+  float mse_vk = 0.0f;
+
+  // check results
+  for (size_t i = 0; i < output_data.size(); ++i) {
+    float input = data_in[i];
+    torch::executor::Half expected_output =
+        static_cast<torch::executor::Half>(input);
+    uint16_t* expected_bits = reinterpret_cast<uint16_t*>(&expected_output);
+    torch::executor::Half output = output_data[i];
+    uint16_t* output_bits = reinterpret_cast<uint16_t*>(&output);
+
+    std::cout << "input = " << input << "(0b"
+              << std::bitset<32>(*reinterpret_cast<uint32_t*>(&input))
+              << "), expected output = " << expected_output << "(0b"
+              << std::bitset<16>(*expected_bits)
+              << "), received output = " << output << "(0b"
+              << std::bitset<16>(*output_bits) << ")" << std::endl;
+
+    // Note: Torch executor half "rounds up" when converting to fp16, whereas
+    // most driver implementations of Vulkan's opFConvert() just truncate the
+    // extra bits for performance (rounding introduces a conditional).
+    // Example:
+    // INPUT F32 = 25.248 (sign{0b0}, exp{0b10000011},
+    // mantissa{0b10010011111101111100111}),
+    // TORCH HALF OUTPUT F16 = 25.25 (sign{0b0}, exp{0b10011},
+    // mantissa{0b1001010000}),
+    // VULKAN OUTPUT F16 = 25.2344 (sign{0b0}, exp{0b10011},
+    // mantissa{0b1001001111})
+    // Note:
+    // The Vulkan mantissa exactly matches the first 10
+    // bits of the input 23-bit mantissa. But since the 11th bit is 1, the
+    // torch half output is rounded up (essentially adding a 1).
+    // Vulkan mantissa{0b1001001111} + 1 = Torch half mantissa{0b1001010000}
+
+    EXPECT_TRUE(
+        (*output_bits == *expected_bits) ||
+        /*rounding error*/ ((*output_bits + 1u) == *expected_bits));
+    mse_ex += std::pow(expected_output - input, 2);
+    mse_vk += std::pow(output - input, 2);
+  }
+
+  mse_ex /= output_data.size();
+  mse_vk /= output_data.size();
+  std::cout << "========================================================="
+            << std::endl;
+  std::cout << "mse_ex = " << mse_ex << ", mse_vk = " << mse_vk << std::endl;
+}
+
+TEST(VulkanComputeGraphOpsTest, test_to_copy) {
+  test_to_copy();
+}
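The tolerance in the EXPECT_TRUE above can be restated as a standalone predicate. Assuming the driver truncates (never increasing magnitude) while torch::executor::Half rounds to nearest, the rounded fp16 bit pattern is either equal to the truncated one or exactly one greater, for either sign. The sketch below captures that; the helper name is illustrative and not part of the test suite.

#include <cassert>
#include <cstdint>

// True when two fp16 bit patterns match exactly, or when the rounded result
// (torch_bits) sits exactly one representation step above the truncated
// result (vulkan_bits) -- the only divergence the test tolerates.
bool equal_within_truncation_ulp(uint16_t vulkan_bits, uint16_t torch_bits) {
  return vulkan_bits == torch_bits ||
         static_cast<uint16_t>(vulkan_bits + 1u) == torch_bits;
}

int main() {
  // The 25.248f example from the comments above:
  //   truncated (Vulkan): 0 10011 1001001111 = 0x4e4f
  //   rounded   (Torch):  0 10011 1001010000 = 0x4e50
  assert(equal_within_truncation_ulp(0x4e4f, 0x4e50));
  assert(!equal_within_truncation_ulp(0x4e4f, 0x4e51));
  return 0;
}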
