[ET-VK] Enable FP16 type in operators

SS-JIA · SS-JIA · commit 33fed6b000eb · 2024-04-16T06:58:12.000-07:00
Differential Revision: [D56189470](https://our.internmc.facebook.com/intern/diff/D56189470/)
ghstack-source-id: 222684648
Pull Request resolved: #3059
diff --git a/backends/vulkan/runtime/api/gen_vulkan_spv.py b/backends/vulkan/runtime/api/gen_vulkan_spv.py
@@ -90,9 +90,8 @@ def define_variable(name: str) -> str:
 
 
 def get_buffer_scalar_type(dtype: str) -> str:
-    # TODO(ssjia): use float16_t for half types
     if dtype == "half":
-        return "float"
+        return "float16_t"
     # TODO(ssjia): use int8_t for int8 types
     elif dtype[-1] == "8":
         return dtype[:-1]
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_dw_prepack_weights.glsl
@@ -19,6 +19,9 @@
 
 #include "indexing_utils.h"
 
+$if DTYPE == "half":
+  #extension GL_EXT_shader_16bit_storage : require
+
 layout(std430) buffer;
 
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv2d_prepack_weights.glsl
@@ -19,6 +19,9 @@
 
 #include "indexing_utils.h"
 
+$if DTYPE == "half":
+  #extension GL_EXT_shader_16bit_storage : require
+
 layout(std430) buffer;
 
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl b/backends/vulkan/runtime/graph/ops/glsl/conv_transpose2d_prepack_weights.glsl
@@ -19,6 +19,9 @@
 
 #include "indexing_utils.h"
 
+$if DTYPE == "half":
+  #extension GL_EXT_shader_16bit_storage : require
+
 layout(std430) buffer;
 
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[2][DTYPE]} image_out;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl b/backends/vulkan/runtime/graph/ops/glsl/image_to_nchw.glsl
@@ -19,6 +19,9 @@
 
 #include "indexing_utils.h"
 
+$if DTYPE == "half":
+  #extension GL_EXT_shader_16bit_storage : require
+
 layout(std430) buffer;
 
 layout(set = 0, binding = 0) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_in;
diff --git a/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl b/backends/vulkan/runtime/graph/ops/glsl/nchw_to_image.glsl
@@ -20,6 +20,9 @@
 
 #include "indexing_utils.h"
 
+$if DTYPE == "half":
+  #extension GL_EXT_shader_16bit_storage : require
+
 layout(std430) buffer;
 
 layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
@@ -52,20 +55,21 @@ void main() {
   const ivec4 buf_indices =
       base_index + ivec4(0, 1, 2, 3) * get_packed_stride(cpu_sizes.data);
 
-  SCALAR_T val_x = SCALAR_T(buffer_in.data[buf_indices.x]);
-  SCALAR_T val_y = SCALAR_T(buffer_in.data[buf_indices.y]);
-  SCALAR_T val_z = SCALAR_T(buffer_in.data[buf_indices.z]);
-  SCALAR_T val_w = SCALAR_T(buffer_in.data[buf_indices.w]);
-
-  VEC4_T texel = VEC4_T(val_x, val_y, val_z, val_w);
-
   const int packed_dim_size = get_packed_dim(cpu_sizes.data);
   int packed_idx = get_packed_dim(idx);
 
-  if (packed_idx + 3 >= packed_dim_size) {
-    ivec4 packed_ind = ivec4(packed_idx) + ivec4(0, 1, 2, 3);
-    VEC4_T valid_idx = VEC4_T(lessThan(packed_ind, ivec4(packed_dim_size)));
-    texel = texel * valid_idx;
+  VEC4_T texel = VEC4_T(0);
+  if (packed_idx < packed_dim_size) {
+    texel.x = SCALAR_T(buffer_in.data[buf_indices.x]);
+  }
+  if (packed_idx + 1 < packed_dim_size) {
+    texel.y = SCALAR_T(buffer_in.data[buf_indices.y]);
+  }
+  if (packed_idx + 2 < packed_dim_size) {
+    texel.z = SCALAR_T(buffer_in.data[buf_indices.z]);
+  }
+  if (packed_idx + 3 < packed_dim_size) {
+    texel.w = SCALAR_T(buffer_in.data[buf_indices.w]);
   }
 
   imageStore(image_out, ${get_pos[NDIM]("pos")}, texel);
diff --git a/backends/vulkan/test/op_tests/cases.py b/backends/vulkan/test/op_tests/cases.py
@@ -41,6 +41,7 @@ def get_mm_inputs():
         ],
     )
     test_suite.prepacked_args = ["mat2"]
+    test_suite.dtypes = ["at::kFloat"]
     return test_suite
 
 
@@ -123,7 +124,7 @@ def get_native_layer_norm_inputs():
         [
             ((S1, S2), [S2], (S2), (S2), 0.001),
             ((M, M1, M2), [M2], (M2), (M2), 0.001),
-            ((L, XL, M1, M2), [M2], (M2), (M2), 0.001),
+            ((S, XL, M1, M2), [M2], (M2), (M2), 0.001),
         ]
     )
     test_suite.supports["layouts"] = ["api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED"]
@@ -177,11 +178,3 @@ def get_select_int_inputs():
     "aten.full.default": get_full_inputs(),
     "aten.select.int": get_select_int_inputs(),
 }
-
-prepacked_args = {"aten.mm.default": {"mat2"}}
-
-support_exceptions = {
-    "aten.max_pool2d_with_indices.default": {
-        "layouts": ["api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED"]
-    },
-}
diff --git a/backends/vulkan/test/op_tests/targets.bzl b/backends/vulkan/test/op_tests/targets.bzl
@@ -1,4 +1,6 @@
 load("@fbsource//tools/build_defs:platform_defs.bzl", "ANDROID")
+load("@fbsource//xplat/caffe2:pt_defs.bzl", "get_pt_ops_deps")
+load("@fbsource//xplat/caffe2:pt_ops.bzl", "pt_operator_library")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 
 def define_common_targets(is_fbcode = False):
@@ -43,6 +45,24 @@ def define_common_targets(is_fbcode = False):
         default_outs = ["."],
     )
 
+    pt_operator_library(
+        name = "all_aten_ops",
+        check_decl = False,
+        include_all_operators = True,
+    )
+
+    runtime.cxx_library(
+        name = "all_aten_ops_lib",
+        srcs = [],
+        define_static_target = False,
+        exported_deps = get_pt_ops_deps(
+            name = "pt_ops_full",
+            deps = [
+                ":all_aten_ops",
+            ],
+        ),
+    )
+
     runtime.cxx_binary(
         name = "compute_graph_op_tests_bin",
         srcs = [
@@ -52,7 +72,7 @@ def define_common_targets(is_fbcode = False):
         deps = [
             "//third-party/googletest:gtest_main",
             "//executorch/backends/vulkan:vulkan_graph_runtime",
-            runtime.external_dep_location("libtorch"),
+            ":all_aten_ops_lib",
         ],
     )
 
@@ -72,6 +92,6 @@ def define_common_targets(is_fbcode = False):
         deps = [
             "//third-party/googletest:gtest_main",
             "//executorch/backends/vulkan:vulkan_graph_runtime",
-            runtime.external_dep_location("libtorch"),
+            ":all_aten_ops_lib",
         ],
     )
diff --git a/backends/vulkan/test/op_tests/utils/codegen.py b/backends/vulkan/test/op_tests/utils/codegen.py
@@ -88,7 +88,6 @@ def __init__(self, op_reg_name: str, f: NativeFunction, suite_def: TestSuite):
         self.dot = "->"
 
         self.args = []
-        self.out = None
         self.refs = {}
 
         self.should_prepack = False
@@ -288,6 +287,7 @@ def set_output(self, ref: ValueRefList) -> str:
         return ret_str
 
     def virtual_resize(self, ref: ValueRefList) -> str:
+        assert isinstance(ref, ValueRef)
         assert ref.src_cpp_type == AT_TENSOR and ref.is_in
         if self.prepack_ref(ref):
             return ""
@@ -296,6 +296,7 @@ def virtual_resize(self, ref: ValueRefList) -> str:
         return ret_str
 
     def copy_into_staging(self, ref: ValueRefList) -> str:
+        assert isinstance(ref, ValueRef)
         assert ref.src_cpp_type == AT_TENSOR and ref.is_in
         if self.prepack_ref(ref):
             return ""
@@ -336,7 +337,7 @@ def check_graph_out(self, ref: ValueRefList) -> str:
                 ret_str += self.check_graph_out(r)
             return ret_str
 
-        return f"EXPECT_TRUE(check_close({ref.src_cpp_name}, vk_{ref.name}));\n"
+        return f"EXPECT_TRUE(check_close({ref.src_cpp_name}, vk_{ref.name}, rtol, atol));\n"
 
     ## Top level code generation
 
@@ -374,11 +375,19 @@ def gen_graph_exec_code(self) -> str:
 
         return graph_exec
 
+    def gen_conditional_skips(self) -> str:
+        skips = f"if (test_dtype == at::kHalf && "
+        skips += f"!{self.graph}{self.dot}context()->adapter_ptr()->has_16bit_storage()) {{\n"
+        skips += "  GTEST_SKIP();"
+        skips += "}\n"
+        return skips
+
     def gen_op_check_fn(self) -> str:
         op_name = self.f.func.name.unambiguous_name()
         op_check_fn = self.gen_decl(f"check_{op_name}") + " {"
         if self.should_prepack:
             op_check_fn = self.gen_decl(f"prepacked_check_{op_name}") + " {"
+        op_check_fn += self.gen_conditional_skips()
         op_check_fn += self.gen_graph_build_code()
         op_check_fn += self.gen_graph_exec_code()
         op_check_fn += self.check_graph_out(self.refs["out"])
@@ -391,19 +400,26 @@ def gen_op_check_fn(self) -> str:
 ##################################
 
 test_fixture_template = """
-class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple<api::StorageType, api::GPUMemoryLayout>> {{
+class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple<at::ScalarType, api::StorageType, api::GPUMemoryLayout>> {{
   protected:
     ComputeGraph* graph;
     at::ScalarType test_dtype = at::kFloat;
+    float rtol = 1e-5;
+    float atol = 1e-5;
 
     void SetUp() override {{
         GraphConfig config;
         api::StorageType default_storage_type;
         api::GPUMemoryLayout default_memory_layout;
-        std::tie(default_storage_type, default_memory_layout) = GetParam();
+        std::tie(test_dtype, default_storage_type, default_memory_layout) = GetParam();
         config.setStorageTypeOverride(default_storage_type);
         config.setMemoryLayoutOverride(default_memory_layout);
         graph = new ComputeGraph(config);
+
+        if (test_dtype == at::kHalf) {{
+            rtol = 1e-2;
+            atol = 1e-2;
+        }}
     }}
 
     void TearDown() override {{
@@ -420,7 +436,7 @@ class GeneratedOpsTest_{op_name} : public ::testing::TestWithParam< ::std::tuple
 
 
 class VkTestSuiteGen(TestSuiteGen):
-    def __init__(self, op_reg_name: str, f: NativeFunction, inputs: List[Any]):
+    def __init__(self, op_reg_name: str, f: NativeFunction, inputs: VkTestSuite):
         super().__init__(f, inputs)
         self.op_reg_name = op_reg_name
         self.generator = ComputeGraphGen(self.op_reg_name, self.f, self.suite_def)
@@ -442,6 +458,8 @@ def generate_fixture_cpp(self) -> str:
         )
 
     def gen_parameterization(self) -> str:
+        # pyre-ignore
+        dtypes = self.suite_def.dtypes
         storage_types = self.suite_def.supports["storage_types"]
         layouts = self.suite_def.supports["layouts"]
 
@@ -450,6 +468,7 @@ def gen_parameterization(self) -> str:
             StorageLayoutCombos_{self.op_name},
             GeneratedOpsTest_{self.op_name},
             ::testing::Combine(
+                ::testing::Values({', '.join(dtypes)}),
                 ::testing::Values({', '.join(storage_types)}),
                 ::testing::Values({', '.join(layouts)})));
         """
diff --git a/backends/vulkan/test/op_tests/utils/codegen_base.py b/backends/vulkan/test/op_tests/utils/codegen_base.py
@@ -39,6 +39,7 @@ class TestSuite:
     input_cases: List[Any]
     prepacked_args = []
     requires_prepack = False
+    dtypes = ["at::kFloat", "at::kHalf"]
 
     def supports_prepack(self):
         return len(self.prepacked_args) > 0
@@ -239,6 +240,6 @@ def generate_preamble(self) -> str:
     def generate_test_suites_cpp(self) -> str:
         return "\n".join([h.generate_suite_cpp() for h in self.suites_gens])
 
-    def add_suite(self, f: NativeFunction, test_suite: TestSuite) -> None:
-        suites_gen = TestSuiteGen(f, test_suite)
+    def add_suite(self, op_reg_name: str, f: NativeFunction, all_input_cases) -> None:
+        suites_gen = TestSuiteGen(f, all_input_cases)
         self.suites_gens.append(suites_gen)
diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py
@@ -54,6 +54,8 @@ def assert_outputs_equal(
                         )
                     )
         else:
+            print(model_output[0])
+            print(ref_output)
             # If one output, eager returns tensor while executor tuple of size 1
             self.assertTrue(
                 torch.allclose(model_output[0], ref_output, atol=atol, rtol=rtol)
@@ -198,8 +200,8 @@ def forward(self, x, y):
 
         sub_module = SubModule()
         sample_inputs = (
-            torch.rand(size=(2, 3), dtype=torch.float32),
-            torch.rand(size=(2, 3), dtype=torch.float32),
+            torch.rand(size=(2, 3), dtype=torch.float16),
+            torch.rand(size=(2, 3), dtype=torch.float16),
         )
 
         self.lower_module_and_test_output(sub_module, sample_inputs)
diff --git a/backends/vulkan/test/vulkan_compute_api_test.cpp b/backends/vulkan/test/vulkan_compute_api_test.cpp
@@ -816,7 +816,7 @@ void run_from_gpu_test(
     api::ScalarType dtype = api::kFloat,
     api::StorageType storage_type = api::StorageType::TEXTURE_3D) {
   vTensor vten =
-      vTensor(api::context(), sizes, api::kFloat, storage_type, memory_layout);
+      vTensor(api::context(), sizes, dtype, storage_type, memory_layout);
 
   std::string kernel_name("idx_fill_texture");
   add_memory_layout_suffix(kernel_name, vten);
@@ -838,16 +838,14 @@ void run_from_gpu_test(
         vten.cpu_sizes_ubo()->buffer());
   }
 
-  api::StorageBuffer staging_buffer(
-      api::context(), api::kFloat, vten.gpu_numel());
+  api::StorageBuffer staging_buffer(api::context(), dtype, vten.gpu_numel());
 
   record_image_to_nchw_op(api::context(), vten, staging_buffer.buffer());
 
   submit_to_gpu();
 
   std::vector<T> data_out(staging_buffer.numel());
-  copy_staging_to_ptr(
-      staging_buffer, data_out.data(), sizeof(float) * staging_buffer.numel());
+  copy_staging_to_ptr(staging_buffer, data_out.data(), staging_buffer.nbytes());
 
   for (int i = 0; i < vten.numel(); i++) {
     CHECK_VALUE(data_out, i, i);
@@ -861,12 +859,16 @@ void run_to_gpu_test(
         api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED,
     api::ScalarType dtype = api::kFloat,
     api::StorageType storage_type = api::StorageType::TEXTURE_3D) {
+  if (dtype == api::kHalf &&
+      !api::context()->adapter_ptr()->has_16bit_storage()) {
+    return;
+  }
+
   vTensor vten =
       vTensor(api::context(), sizes, api::kFloat, storage_type, memory_layout);
 
   // Create and fill input staging buffer
-  api::StorageBuffer staging_buffer_in(
-      api::context(), api::kFloat, vten.gpu_numel());
+  api::StorageBuffer staging_buffer_in(api::context(), dtype, vten.gpu_numel());
 
   std::vector<T> data_in(staging_buffer_in.numel());
   for (int i = 0; i < staging_buffer_in.numel(); i++) {
@@ -876,7 +878,7 @@ void run_to_gpu_test(
 
   // Output staging buffer
   api::StorageBuffer staging_buffer_out(
-      api::context(), api::kFloat, vten.gpu_numel());
+      api::context(), dtype, vten.gpu_numel());
 
   // Copy data in and out of the tensor
   record_nchw_to_image_op(api::context(), staging_buffer_in.buffer(), vten);
@@ -888,9 +890,7 @@ void run_to_gpu_test(
   // Extract data from output staging buffer
   std::vector<T> data_out(staging_buffer_out.numel());
   copy_staging_to_ptr(
-      staging_buffer_out,
-      data_out.data(),
-      sizeof(float) * staging_buffer_out.numel());
+      staging_buffer_out, data_out.data(), staging_buffer_out.nbytes());
 
   // All indices should be equal to the input data
   for (int i = 0; i < vten.numel(); i++) {
@@ -943,7 +943,7 @@ TEST(VulkanToFromGPUShaderTest, to_gpu_and_from_gpu_test_texture) {
 
   for (auto& sizes : to_test) {
     RUN_TESTS(float, api::kFloat)
-    RUN_TESTS(float, api::kHalf)
+    RUN_TESTS(c10::Half, api::kHalf)
   }
 #undef RUN_TESTS
 }

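Line-number view of the hunks for backends/vulkan/test/op_tests/cases.py (same changes as in the diff above):

Original file line number	Diff line number	Diff line change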
`@@ -41,6 +41,7 @@ def get_mm_inputs():`
`41`	`41`	`],`
`42`	`42`	`)`
`43`	`43`	`test_suite.prepacked_args = ["mat2"]`
	`44`	`+ test_suite.dtypes = ["at::kFloat"]`
`44`	`45`	`return test_suite`
`45`	`46`
`46`	`47`
`@@ -123,7 +124,7 @@ def get_native_layer_norm_inputs():`
`123`	`124`	`[`
`124`	`125`	`((S1, S2), [S2], (S2), (S2), 0.001),`
`125`	`126`	`((M, M1, M2), [M2], (M2), (M2), 0.001),`
`126`		`- ((L, XL, M1, M2), [M2], (M2), (M2), 0.001),`
	`127`	`+ ((S, XL, M1, M2), [M2], (M2), (M2), 0.001),`
`127`	`128`	`]`
`128`	`129`	`)`
`129`	`130`	`test_suite.supports["layouts"] = ["api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED"]`
`@@ -177,11 +178,3 @@ def get_select_int_inputs():`
`177`	`178`	`"aten.full.default": get_full_inputs(),`
`178`	`179`	`"aten.select.int": get_select_int_inputs(),`
`179`	`180`	`}`
`180`		`-`
`181`		`-prepacked_args = {"aten.mm.default": {"mat2"}}`
`182`		`-`
`183`		`-support_exceptions = {`
`184`		`- "aten.max_pool2d_with_indices.default": {`
`185`		`- "layouts": ["api::GPUMemoryLayout::TENSOR_CHANNELS_PACKED"]`
`186`		`- },`
`187`		`-}`
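Line-number view of the hunks for backends/vulkan/test/test_vulkan_delegate.py (same changes as in the diff above):

Original file line number	Diff line number	Diff line change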
`@@ -54,6 +54,8 @@ def assert_outputs_equal(`
`54`	`54`	`)`
`55`	`55`	`)`
`56`	`56`	`else:`
	`57`	`+ print(model_output[0])`
	`58`	`+ print(ref_output)`
`57`	`59`	`# If one output, eager returns tensor while executor tuple of size 1`
`58`	`60`	`self.assertTrue(`
`59`	`61`	`torch.allclose(model_output[0], ref_output, atol=atol, rtol=rtol)`
`@@ -198,8 +200,8 @@ def forward(self, x, y):`
`198`	`200`
`199`	`201`	`sub_module = SubModule()`
`200`	`202`	`sample_inputs = (`
`201`		`- torch.rand(size=(2, 3), dtype=torch.float32),`
`202`		`- torch.rand(size=(2, 3), dtype=torch.float32),`
	`203`	`+ torch.rand(size=(2, 3), dtype=torch.float16),`
	`204`	`+ torch.rand(size=(2, 3), dtype=torch.float16),`
`203`	`205`	`)`
`204`	`206`
`205`	`207`	`self.lower_module_and_test_output(sub_module, sample_inputs)`