
Commit 108116c

Update base for Update on "Use std::variant to implement pytree Key"
Key was a struct that should've been a union; std::variant makes using a union much easier.

Differential Revision: [D65575184](https://our.internmc.facebook.com/intern/diff/D65575184/)

[ghstack-poisoned]
2 parents 03b1ef2 + 545535b commit 108116c
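For context on the stacked change this commit rebases: moving Key from a hand-rolled struct to std::variant means a key stores exactly one alternative instead of carrying every possible member plus a tag. Below is a minimal sketch of the idea, with hypothetical names and alternatives; it is not the actual ExecuTorch pytree definition.

#include <cstdint>
#include <string>
#include <variant>

// Hypothetical stand-in for a pytree Key: exactly one alternative is active,
// so there is no separate tag field to keep in sync with the stored value.
using Key = std::variant<std::string, int64_t>;

// Checking which alternative is held is a library call, not a manual tag check.
inline bool is_dict_key(const Key& k) {
  return std::holds_alternative<std::string>(k);
}

inline bool is_index(const Key& k) {
  return std::holds_alternative<int64_t>(k);
}

Compared with a manual tagged struct, the variant fails loudly on misuse: std::get on the wrong alternative throws std::bad_variant_access instead of silently reading a stale member.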

File tree

17 files changed: 229 additions & 591 deletions


.github/workflows/ghstack_land.yml

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@ on:
     branches:
       - 'gh/cccclai/[0-9]+/base'
       - 'gh/dbort/[0-9]+/base'
+      - 'gh/dvorjackz/[0-9]+/base'
       - 'gh/guangy10/[0-9]+/base'
       - 'gh/helunwencser/[0-9]+/base'
       - 'gh/jorgep31415/[0-9]+/base'

backends/arm/test/runner_utils.py

Lines changed: 7 additions & 2 deletions
@@ -448,16 +448,21 @@ def run_tosa_ref_model(
             ), "There are no quantization parameters, check output parameters"
             tosa_ref_output = (tosa_ref_output - quant_param.zp) * quant_param.scale
 
+        if tosa_ref_output.dtype == np.double:
+            tosa_ref_output = tosa_ref_output.astype("float32")
+
         # tosa_output is a numpy array, convert to torch tensor for comparison
-        tosa_ref_outputs.append(torch.from_numpy(tosa_ref_output.astype("float32")))
+        tosa_ref_outputs.append(torch.from_numpy(tosa_ref_output))
 
     return tosa_ref_outputs
 
 
 def prep_data_for_save(
     data, is_quantized: bool, input_name: str, quant_param: QuantizationParams
 ):
-    data_np = np.array(data.detach(), order="C").astype(np.float32)
+    data_np = np.array(data.detach(), order="C").astype(
+        f"{data.dtype}".replace("torch.", "")
+    )
 
     if is_quantized:
         assert quant_param.node_name in input_name, (

backends/cadence/aot/ops_registrations.py

Lines changed: 54 additions & 0 deletions
@@ -66,6 +66,12 @@
 lib.define(
     "quantized_conv.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
 )
+lib.define(
+    "quantized_conv.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False) -> (Tensor Z)"
+)
+lib.define(
+    "quantized_conv.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)"
+)
 
 lib.define(
     "quantized_matmul(Tensor X, int X_zero_point, Tensor Y, int Y_zero_point, Tensor? bias, int out_multiplier, int out_shift, int out_zero_point, bool transposed=False) -> (Tensor Z)"
@@ -171,6 +177,54 @@ def quantized_conv_meta(
     return input.new_empty(output_size, dtype=input.dtype)
 
 
+@register_fake("cadence::quantized_conv.per_tensor")
+def quantized_conv_per_tensor_meta(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    stride: Tuple[int],
+    padding: Tuple[int],
+    dilation: Tuple[int],
+    groups: int,
+    in_zero_point: int,
+    weight_zero_point: int,
+    bias_scale: float,
+    output_scale: float,
+    output_zero_point: int,
+    out_multiplier: int,
+    out_shift: int,
+    channel_last: bool = False,
+) -> torch.Tensor:
+    if channel_last:
+        out_channels, *kernel_size, _ = weight.shape
+    else:
+        out_channels, _, *kernel_size = weight.shape
+
+    in_size = input.shape
+    # Assert that the input tensor has at least 3 dimensions, and at most 6
+    assert len(in_size) > 2
+    assert len(in_size) < 6
+
+    # Compute the output tensor size
+    output_size = (
+        get_conv1d_output_size(
+            in_size,
+            out_channels,
+            stride[1],
+            padding[1],
+            dilation[1],
+            kernel_size[0],
+            channel_last,
+        )
+        if len(in_size) == 3
+        else get_conv2d_output_size(
+            in_size, out_channels, stride, padding, dilation, kernel_size, channel_last
+        )
+    )
+
+    return input.new_empty(output_size, dtype=input.dtype)
+
+
 @register_fake("cadence::quantized_layer_norm")
 def quantized_layer_norm_meta(
     input: torch.Tensor,

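The per_tensor meta function above only has to produce an output shape. For reference, this is the standard per-dimension convolution output-size arithmetic that helpers such as get_conv1d_output_size/get_conv2d_output_size are presumably applying; the sketch below is illustrative only and not the Cadence implementation.

#include <cstdio>

// Standard output-size formula for one spatial dimension of a convolution.
int conv_out_dim(int in, int kernel, int stride, int padding, int dilation) {
  return (in + 2 * padding - dilation * (kernel - 1) - 1) / stride + 1;
}

int main() {
  // A 32-wide input with a 3-wide kernel, stride 1, padding 1, dilation 1
  // keeps its size: prints 32.
  std::printf("%d\n", conv_out_dim(32, 3, 1, 1, 1));
  return 0;
}
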
backends/vulkan/runtime/gen_vulkan_spv.py

Lines changed: 25 additions & 1 deletion
@@ -540,6 +540,7 @@ def __init__(
         env: Dict[Any, Any],
         glslc_path: Optional[str],
         glslc_flags: str = "",
+        replace_u16vecn: bool = False,
     ) -> None:
         if isinstance(src_dir_paths, str):
             self.src_dir_paths = [src_dir_paths]
@@ -549,6 +550,7 @@ def __init__(
         self.env = env
         self.glslc_path = glslc_path
         self.glslc_flags = glslc_flags
+        self.replace_u16vecn = replace_u16vecn
 
         self.glsl_src_files: Dict[str, str] = {}
         self.template_yaml_files: List[str] = []
@@ -705,6 +707,22 @@ def constructOutputMap(self) -> None:
                 self.create_shader_params(),
             )
 
+    def maybe_replace_u16vecn(self, input_text: str) -> str:
+        """
+        There is a latency benefit to using u16vecn variables to store texture position
+        variables instead of ivecn, likely due to reduced register pressure. However,
+        SwiftShader does not support 16 bit integer types in shaders, so this is a crude
+        way to fallback to using ivecn to store texture positions so that testing with
+        SwiftShader is still possible.
+        """
+        if not self.replace_u16vecn:
+            return input_text
+        if "codegen-nosub" in input_text:
+            return input_text
+
+        input_text = input_text.replace("u16vec", "ivec")
+        return input_text
+
     def generateSPV(self, output_dir: str) -> Dict[str, str]:
         output_file_map = {}
 
@@ -716,6 +734,7 @@ def process_shader(shader_paths_pair):
 
             with codecs.open(source_glsl, "r", encoding="utf-8") as input_file:
                 input_text = input_file.read()
+                input_text = self.maybe_replace_u16vecn(input_text)
                 output_text = preprocess(input_text, shader_params)
 
             glsl_out_path = os.path.join(output_dir, f"{shader_name}.glsl")
@@ -1029,6 +1048,7 @@ def main(argv: List[str]) -> int:
     parser.add_argument("-c", "--glslc-path", required=True, help="")
     parser.add_argument("-t", "--tmp-dir-path", required=True, help="/tmp")
    parser.add_argument("-o", "--output-path", required=True, help="")
+    parser.add_argument("--replace-u16vecn", action="store_true", default=False)
     parser.add_argument("--optimize_size", action="store_true", help="")
     parser.add_argument("--optimize", action="store_true", help="")
     parser.add_argument(
@@ -1056,7 +1076,11 @@ def main(argv: List[str]) -> int:
         glslc_flags += "-O"
 
     shader_generator = SPVGenerator(
-        options.glsl_paths, env, options.glslc_path, glslc_flags
+        options.glsl_paths,
+        env,
+        options.glslc_path,
+        glslc_flags=glslc_flags,
+        replace_u16vecn=options.replace_u16vecn,
     )
     output_spv_files = shader_generator.generateSPV(options.tmp_dir_path)

backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl

Lines changed: 2 additions & 0 deletions
@@ -6,6 +6,8 @@
  * LICENSE file in the root directory of this source tree.
  */
 
+// codegen-nosub
+
 #version 450 core
 
 #define PRECISION ${PRECISION}

backends/vulkan/targets.bzl

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ def vulkan_spv_shader_lib(name, spv_filegroups, is_fbcode = False):
         select({
             "DEFAULT": "",
             "ovr_config//os:android": "--optimize",
+            "ovr_config//os:linux": "--replace-u16vecn",
         })
     )

devtools/inspector/_inspector_utils.py

Lines changed: 1 addition & 0 deletions
@@ -112,6 +112,7 @@ def get_scalar_type_size(scalar_type: ScalarType) -> Tuple[torch.dtype, int]:
         ScalarType.BYTE: (torch.uint8, 1),
         ScalarType.CHAR: (torch.int8, 1),
         ScalarType.BOOL: (torch.bool, 1),
+        ScalarType.BITS16: (torch.uint16, 2),
         ScalarType.SHORT: (torch.int16, 2),
         ScalarType.HALF: (torch.float16, 2),
         ScalarType.INT: (torch.int, 4),

exir/passes/executorch_prim_ops_registry.py

Lines changed: 9 additions & 0 deletions
@@ -4,6 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
+import math
 import operator
 from typing import Dict, Set, Union
 
@@ -14,6 +15,8 @@
 from torch._ops import OpOverload
 from torch.library import Library
 
+# pyre-unsafe
+
 
 executorch_prims_lib = Library("executorch_prim", "DEF")
 
@@ -91,7 +94,13 @@ def neg(a: _SymScalar) -> _SymScalar:
     return -a # pyre-ignore
 
 
+@bind_pattern_to_op(executorch_prims_lib, "trunc.Scalar(Scalar a) -> Scalar")
+def trunc(a: _SymScalar) -> _SymScalar:
+    return math.trunc(a) # pyre-ignore
+
+
 _PYTHON_SYM_OPS_TO_EXECUTORCH_SYM_OPS: Dict[OpOverload, OpOverload] = {
+    math.trunc: ops.backend.executorch_prim.trunc.Scalar,
     operator.sub: ops.backend.executorch_prim.sub.Scalar,
     operator.mul: ops.backend.executorch_prim.mul.Scalar,
     operator.add: ops.backend.executorch_prim.add.Scalar,

extension/llm/custom_ops/targets.bzl

Lines changed: 7 additions & 2 deletions
@@ -1,10 +1,14 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load(
+    "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
+    "get_vec_preprocessor_flags",
+    "get_vec_deps",
+)
 load(
     "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl",
     "get_compiler_optimization_flags",
 )
 
-
 def define_common_targets():
     """Defines targets that should be shared between fbcode and xplat.
 
@@ -26,6 +30,7 @@ def define_common_targets():
                 "op_sdpa.h",
                 "op_update_quantized_cache.h",
             ],
+            preprocessor_flags = get_vec_preprocessor_flags(),
             exported_deps = [
                 "//executorch/runtime/kernel:kernel_includes",
                 "//executorch/kernels/portable/cpu:scalar_utils",
@@ -38,7 +43,7 @@ def define_common_targets():
             deps = [
                 "//executorch/kernels/portable/cpu/util:reduce_util",
                 "//executorch/extension/llm/custom_ops/spinquant:fast_hadamard_transform",
-            ],
+            ] + get_vec_deps(),
            compiler_flags = ["-Wno-missing-prototypes", "-Wno-global-constructors"] + get_compiler_optimization_flags(),
             visibility = [
                 "//executorch/...",

kernels/optimized/lib_defs.bzl

Lines changed: 37 additions & 9 deletions
@@ -15,16 +15,44 @@ load(
 # functions in order to declare the required compiler flags needed in order to
 # access CPU vector intrinsics.
 
-def get_vec_android_preprocessor_flags():
-    preprocessor_flags = [
-        (
-            "^android-arm64.*$",
-            [
+def get_vec_preprocessor_flags():
+    if not runtime.is_oss:
+        # various ovr_configs are not available in oss
+        preprocessor_flags = select({
+            "ovr_config//os:linux-x86_64": [
                 "-DET_BUILD_ARM_VEC256_WITH_SLEEF",
-            ],
-        ),
-    ]
-    return preprocessor_flags
+            ] if not runtime.is_oss else [],
+            "ovr_config//os:iphoneos-arm64": [
+                "-DET_BUILD_ARM_VEC256_WITH_SLEEF",
+            ] if not runtime.is_oss else [],
+            "ovr_config//os:macos-arm64": [
+                "-DET_BUILD_ARM_VEC256_WITH_SLEEF",
+            ] if not runtime.is_oss else [],
+            "ovr_config//os:android-arm64": [
+                "-DET_BUILD_ARM_VEC256_WITH_SLEEF",
+            ] if not runtime.is_oss else [],
+            "DEFAULT": [],
+        })
+        return preprocessor_flags
+    return []
+
+def get_vec_deps():
+    if not runtime.is_oss:
+        # various ovr_configs are not available in oss
+        deps = select({
+            "ovr_config//os:iphoneos-arm64": [
+                "fbsource//third-party/sleef:sleef_arm",
+            ] if not runtime.is_oss else [],
+            "ovr_config//os:macos-arm64": [
+                "fbsource//third-party/sleef:sleef_arm",
+            ] if not runtime.is_oss else [],
+            "ovr_config//os:android-arm64": [
+                "fbsource//third-party/sleef:sleef_arm",
+            ] if not runtime.is_oss else [],
+            "DEFAULT": [],
+        })
+        return deps
+    return []
 
 def get_vec_cxx_preprocessor_flags():
     preprocessor_flags = [

kernels/optimized/op_registration_util.bzl

Lines changed: 4 additions & 3 deletions
@@ -2,7 +2,8 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load("@fbsource//xplat/executorch/build:selects.bzl", "selects")
 load(
     "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
-    "get_vec_android_preprocessor_flags",
+    "get_vec_preprocessor_flags",
+    "get_vec_deps",
 )
 load(
     "@fbsource//xplat/executorch/kernels/portable:op_registration_util.bzl",
@@ -94,8 +95,8 @@ def define_op_library(name, deps):
         compiler_flags = ["-Wno-missing-prototypes"] + get_compiler_optimization_flags(),
         deps = [
             "//executorch/runtime/kernel:kernel_includes",
-        ] + augmented_deps,
-        fbandroid_platform_preprocessor_flags = get_vec_android_preprocessor_flags(),
+        ] + augmented_deps + get_vec_deps(),
+        preprocessor_flags = get_vec_preprocessor_flags(),
         # sleef needs to be added as a direct dependency of the operator target when building for Android,
         # or a linker error may occur. Not sure why this happens; it seems that fbandroid_platform_deps of
         # dependencies are not transitive

kernels/optimized/test/targets.bzl

Lines changed: 2 additions & 2 deletions
@@ -1,7 +1,7 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load(
     "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
-    "get_vec_android_preprocessor_flags",
+    "get_vec_preprocessor_flags",
     "get_vec_cxx_preprocessor_flags",
 )
 load("@fbsource//xplat/executorch/kernels/test:util.bzl", "define_supported_features_lib")
@@ -27,7 +27,7 @@ def _lib_test_bin(name, extra_deps = [], in_cpu = False):
             "//executorch/kernels/optimized{}:{}".format(cpu_path, lib_root),
         ] + extra_deps,
         cxx_platform_preprocessor_flags = get_vec_cxx_preprocessor_flags(),
-        fbandroid_platform_preprocessor_flags = get_vec_android_preprocessor_flags(),
+        preprocessor_flags = get_vec_preprocessor_flags(),
     )
 
 def define_common_targets():

kernels/prim_ops/register_prim_ops.cpp

Lines changed: 16 additions & 0 deletions
@@ -12,6 +12,8 @@
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/kernel/operator_registry.h>
 
+#include <cmath>
+
 using torch::executor::function::et_copy_index;
 
 namespace torch {
@@ -301,6 +303,20 @@ static Kernel prim_ops[] = {
           }
         }),
 
+    // trunc.Scalar(Scalar a) -> Scalar
+    Kernel(
+        "executorch_prim::trunc.Scalar",
+        [](KernelRuntimeContext& context, EValue** stack) {
+          (void)context;
+          EValue& a = *stack[0];
+          EValue& out = *stack[1];
+          if (a.isDouble()) {
+            out = EValue(static_cast<int64_t>(trunc(a.toDouble())));
+          } else {
+            ET_CHECK_MSG(false, "%zu", (size_t)a.tag);
+          }
+        }),
+
     // executorch_prim::et_copy_index.tensor(tensor, tensor) -> tensor
     Kernel("executorch_prim::et_copy_index.tensor", &et_copy_index),
     // executorch_prim::et_view.default(Tensor, int[]) -> Tensor

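The new trunc.Scalar kernel above maps a Double EValue to an Int EValue by truncating toward zero, matching the math.trunc binding added in exir/passes/executorch_prim_ops_registry.py. A standalone sketch of that rounding behavior (illustrative only, outside the ExecuTorch runtime):

#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  // trunc() drops the fractional part and rounds toward zero, unlike floor().
  const double samples[] = {2.9, -2.9, 0.5};
  for (double v : samples) {
    std::printf("trunc(%.1f) = %lld\n", v, static_cast<long long>(std::trunc(v)));
  }
  // Prints: trunc(2.9) = 2, trunc(-2.9) = -2, trunc(0.5) = 0
  return 0;
}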