
Commit e32dffd

Merge branch 'main' into shoumikhin-patch-6
2 parents: def2298 + 1a27c14

14 files changed: +199 -27 lines

backends/vulkan/runtime/graph/ops/glsl/conv2d_pw.glsl

Lines changed: 36 additions & 14 deletions

@@ -88,10 +88,18 @@ void main() {
     ipos[i] = pos[i] * stride - padding;
   }

-  vec4 sum[TILE_SIZE_X * TILE_SIZE_Y];
-  sum[0] = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
-  for (int i = 1; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
-    sum[i] = sum[0];
+  // Final output array where each element is a tensor value.
+  // Tuple of consecutive 4 elements represents a single output texel.
+  float sum[TILE_SIZE_X * TILE_SIZE_Y * 4];
+
+  const vec4 bias = texelFetch(t_bias, ivec2(gpos.z, 0), 0);
+
+  // Initialize the output array with the bias value
+  for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y * 4; i += 4) {
+    sum[i] = bias.x;
+    sum[i + 1] = bias.y;
+    sum[i + 2] = bias.z;
+    sum[i + 3] = bias.w;
   }

   int z4 = 0;
@@ -100,14 +108,26 @@ void main() {
     // During prepacking, the weight tensor has been permuted so that the
     // channel (IC) dim is along the x-axis, and the batch (OC) dim is along
     // the z-axis.
-    const vec4 ktex_0 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(0, 0));
-    const vec4 ktex_1 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(1, 0));
-    const vec4 ktex_2 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(2, 0));
-    const vec4 ktex_3 = texelFetchOffset(t_kernel, ivec2(z, gpos.z), 0, ivec2(3, 0));
+    float kernel_values[4 * 4]; // 4 channels, 4 elements per channel
+
+    // Load kernel values from texels to array
+    for (int i = 0; i < 4; ++i) {
+      const vec4 k_tex = texelFetch(t_kernel, ivec2(z + i, gpos.z), 0);
+      kernel_values[i * 4 + 0] = k_tex.x;
+      kernel_values[i * 4 + 1] = k_tex.y;
+      kernel_values[i * 4 + 2] = k_tex.z;
+      kernel_values[i * 4 + 3] = k_tex.w;
+    }

-#pragma unroll
     for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
       const vec4 in_tex = texelFetch(t_in, ivec3(ipos[i], z4), 0);
+      // Load the input texel into an array
+      float tex_values[4];
+      tex_values[0] = in_tex.x;
+      tex_values[1] = in_tex.y;
+      tex_values[2] = in_tex.z;
+      tex_values[3] = in_tex.w;
+
       // For 2x2 tile size algorithm works as follows.
       // To explain the calculations below, the contents of one in_tex and the
       // group of 4 texels loaded from t_kernel are shown:
@@ -141,18 +161,20 @@ void main() {
       //
       // which is what is expressed in the following calculations. This is done
       // for each output position.
-      sum[i] = fma(in_tex.xxxx, ktex_0, sum[i]);
-      sum[i] = fma(in_tex.yyyy, ktex_1, sum[i]);
-      sum[i] = fma(in_tex.zzzz, ktex_2, sum[i]);
-      sum[i] = fma(in_tex.wwww, ktex_3, sum[i]);
+      for (int j = 0; j < 4; ++j) {
+        sum[i * 4 + j] = tex_values[0] * kernel_values[0 + j] + sum[i * 4 + j];
+        sum[i * 4 + j] = tex_values[1] * kernel_values[4 + j] + sum[i * 4 + j];
+        sum[i * 4 + j] = tex_values[2] * kernel_values[8 + j] + sum[i * 4 + j];
+        sum[i * 4 + j] = tex_values[3] * kernel_values[12 + j] + sum[i * 4 + j];
+      }
     }
   }

   for (int i = 0; i < TILE_SIZE_X * TILE_SIZE_Y; ++i) {
     const uint index = (shared_mem_stride * i) + gl_LocalInvocationIndex;
     const ivec3 pos = pos_shared[offset_pos_index(index)];
     if (all(lessThan(pos, out_limits.xyz))) {
-      imageStore(t_out, pos, op(sum[i], out_min, out_max));
+      imageStore(t_out, pos, op(vec4(sum[i * 4], sum[i * 4 + 1], sum[i * 4 + 2], sum[i * 4 + 3]), out_min, out_max));
     }
   }
 }
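The rewrite trades the vectorized `fma` accumulation for explicit scalar indexing: each output texel now occupies four consecutive `float` slots, and slot `j` accumulates the dot product of the four input channels with kernel column `j`. A minimal Python sketch of the per-texel arithmetic, with made-up values (array layouts mirror the shader's):

```python
# Per-texel accumulation from the rewritten shader, in plain Python.
# Values are made up; array layouts mirror the GLSL arrays above.
tex_values = [1.0, 2.0, 3.0, 4.0]              # one input texel: 4 input channels
kernel_values = [float(v) for v in range(16)]  # kernel_values[c * 4 + j]: channel c, output j
bias = [0.5, 0.5, 0.5, 0.5]                    # one bias texel

sum_ = list(bias)                              # sum[i * 4 + j] for a single tile position i
for j in range(4):                             # output component within the texel
    for c in range(4):                         # input channel
        sum_[j] += tex_values[c] * kernel_values[c * 4 + j]

# sum_ is exactly what the old vectorized form computed:
#   sum[i] = fma(in_tex.xxxx, ktex_0, sum[i]); ...; sum[i] = fma(in_tex.wwww, ktex_3, sum[i])
print(sum_)
```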

backends/xnnpack/operators/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -20,6 +20,7 @@
     op_dynamic_quantize_ops,
     op_elu,
     op_floor,
+    op_gelu,
     op_hardswish,
     op_hardtanh,
     op_leaky_relu,

backends/xnnpack/operators/op_gelu.py

Lines changed: 52 additions & 0 deletions

@@ -0,0 +1,52 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import Dict
+
+import torch
+from executorch.backends.xnnpack.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import (
+    XNNGelu,
+    XNNGraph,
+    XNode,
+)
+from executorch.backends.xnnpack.utils.utils import get_input_node
+
+
+@register_node_visitor
+class GeluVisitor(NodeVisitor):
+    target = "aten.gelu.default"
+
+    def __init__(self, *args) -> None:
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        xnn_graph: XNNGraph,
+        vals_to_ids: Dict[torch.fx.Node, int],
+        debug_handle: int,
+    ) -> None:
+        self.define_nodes_tensor_inputs_outputs(node, xnn_graph, vals_to_ids)
+
+        # input
+        input_id = vals_to_ids[get_input_node(node, 0)]
+
+        # output
+        output_id = vals_to_ids[node]
+
+        ser_node = XNode(
+            xnode_union=XNNGelu(
+                input_id=input_id,
+                output_id=output_id,
+                flags=0,
+            ),
+            debug_handle=debug_handle,
+        )
+        xnn_graph.xnodes.append(ser_node)

backends/xnnpack/partition/config/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -26,6 +26,7 @@
     DeQuantizedPerTensorConfig,
     DivConfig,
     FloorConfig,
+    GeluConfig,
     HardswishConfig,
     # EluConfig,
     HardtanhConfig,
@@ -79,6 +80,7 @@
     DivConfig,
     # EluConfig, # Waiting for PyTorch Pin Update
     FloorConfig,
+    GeluConfig,
     HardtanhConfig,
     HardswishConfig,
     LeakyReLUConfig,

backends/xnnpack/partition/config/generic_node_configs.py

Lines changed: 7 additions & 0 deletions

@@ -343,6 +343,13 @@ def supported_precision_types(self) -> List[ConfigPrecisionType]:
         return [ConfigPrecisionType.FP32]


+class GeluConfig(GenericNodePartitionerConfig):
+    target_name = "gelu.default"
+
+    def supported_precision_types(self) -> List[ConfigPrecisionType]:
+        return [ConfigPrecisionType.FP32]
+
+
 class HardswishConfig(GenericNodePartitionerConfig):
     target_name = "hardswish.default"

backends/xnnpack/partition/configs.py

Lines changed: 1 addition & 0 deletions

@@ -65,6 +65,7 @@
     exir_ops.edge.aten.addmm.default,  # TODO(T163877189) add constraint for addmm
     exir_ops.edge.aten.rsqrt.default,
     exir_ops.edge.aten.log.default,
+    exir_ops.edge.aten.gelu.default,
 ]

 SUPPORTED_MODULES = [
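With `GeluConfig` registered and `aten.gelu.default` listed among the supported ops, gelu nodes get claimed by the XNNPACK partitioner like any other unary op. A hedged end-to-end sketch (API names follow the current executorch export flow; the model and inputs are illustrative, not taken from this diff):

```python
# Sketch: lowering a GELU model to XNNPACK (assumed current executorch APIs).
import torch
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower


class GeluModel(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.gelu(x)


exported = torch.export.export(GeluModel(), (torch.randn(20),))
# The partitioner consults GeluConfig and hands the gelu node to the delegate.
program = to_edge_transform_and_lower(
    exported, partitioner=[XnnpackPartitioner()]
).to_executorch()
```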

backends/xnnpack/runtime/XNNCompiler.cpp

Lines changed: 31 additions & 0 deletions

@@ -1448,6 +1448,36 @@ Error defineLogNode(
   return Error::Ok;
 }

+/*
+Define serialized gelu node into the subgraph, using the remapped ids
+to map the serialized ids, to the new ids generated when defining the
+tensor value
+*/
+Error defineGeluNode(
+    xnn_subgraph_t subgraph_ptr,
+    const std::unordered_map<uint32_t, uint32_t>& remapped_ids,
+    const NodePtr node,
+    const fb_xnnpack::XNNGraph* graph) noexcept {
+  MAYBE_UNUSED(graph);
+
+  auto graph_node = node->xnode_union_as_XNNGelu();
+
+  xnn_status status = xnn_define_gelu(
+      subgraph_ptr,
+      remapped_ids.at(graph_node->input_id()),
+      remapped_ids.at(graph_node->output_id()),
+      graph_node->flags());
+
+  ET_CHECK_OR_RETURN_ERROR(
+      status == xnn_status_success,
+      Internal,
+      "Failed to create gelu node %i with code: %s",
+      node->debug_handle(),
+      xnn_status_to_string(status));
+
+  return Error::Ok;
+}
+
 /*
 Define serialized ceiling node into the subgraph, using the remapped ids
 to map the serialized ids, to the new ids generated when defining the
@@ -2009,6 +2039,7 @@ DefineNodeFunc getDefineNodeFunc(fb_xnnpack::XNodeUnion nodeType) {
     _DEFINE(SquareRoot)
     _DEFINE(ReciprocalSquareRoot)
    _DEFINE(Ceiling)
+    _DEFINE(Gelu)
     _DEFINE(Hardswish)
     _DEFINE(LeakyReLU)
     _DEFINE(Log)
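Like its siblings, `defineGeluNode` resolves tensor references through `remapped_ids`: the flatbuffer stores ids assigned at serialization time, while XNNPACK hands out fresh ids as each tensor value is defined at load time. A conceptual Python sketch of that mapping (ids are made up; the real logic is the C++ above):

```python
# Conceptual sketch of the id remapping used by the define*Node helpers.
# Ids are made up; the runtime builds this map while defining tensor values.
remapped_ids = {}  # serialized tensor id -> id assigned by xnn_define_tensor_value

def define_tensor_value(serialized_id: int) -> None:
    remapped_ids[serialized_id] = len(remapped_ids)  # stand-in for the XNNPACK id

define_tensor_value(12)  # gelu input id in the flatbuffer
define_tensor_value(15)  # gelu output id in the flatbuffer

# defineGeluNode then effectively calls:
#   xnn_define_gelu(subgraph, remapped_ids[12], remapped_ids[15], flags)
assert (remapped_ids[12], remapped_ids[15]) == (0, 1)
```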

backends/xnnpack/serialization/runtime_schema.fbs

Lines changed: 1 addition & 0 deletions

@@ -140,6 +140,7 @@ union XNodeUnion {
   XNNConvTranspose2d: _XNNNodeConv,
   XNNReciprocalSquareRoot: _XNNNode1x1,
   XNNLog: _XNNNode1x1,
+  XNNGelu: _XNNNode1x1,
 }

 union XValueUnion {

backends/xnnpack/serialization/schema.fbs

Lines changed: 1 addition & 0 deletions

@@ -136,6 +136,7 @@ union XNodeUnion {
   XNNConvTranspose2d: _XNNNodeConv,
   XNNReciprocalSquareRoot: _XNNNode1x1,
   XNNLog: _XNNNode1x1,
+  XNNGelu: _XNNNode1x1,
 }

 union XValueUnion {
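Both schemas register `XNNGelu` against `_XNNNode1x1`, the generic one-input, one-output node payload, so no new flatbuffer table is needed. Its assumed Python mirror (field names inferred from how `GeluVisitor` constructs `XNNGelu` above, not copied from this diff):

```python
# Assumed shape of the XNNNode1x1 base dataclass that XNNGelu inherits.
from dataclasses import dataclass


@dataclass
class XNNNode1x1:
    input_id: int
    output_id: int
    flags: int


@dataclass
class XNNGelu(XNNNode1x1):
    pass


node = XNNGelu(input_id=0, output_id=1, flags=0)
```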

backends/xnnpack/serialization/xnnpack_graph_schema.py

Lines changed: 6 additions & 0 deletions

@@ -291,6 +291,11 @@ class XNNCeiling(XNNNode1x1):
     pass


+@dataclass
+class XNNGelu(XNNNode1x1):
+    pass
+
+
 @dataclass
 class XNNHardswish(XNNNode1x1):
     pass
@@ -385,6 +390,7 @@ class XNNScaledDotProductAttention:
     XNNBatchMatrixMultiply,
     XNNReciprocalSquareRoot,
     XNNLog,
+    XNNGelu,
 ]

Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import torch
+from executorch.backends.xnnpack.test.tester import Tester
+
+
+class TestGelu(unittest.TestCase):
+    def setUp(self):
+        torch._dynamo.reset()
+
+    class Gelu(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.gelu = torch.nn.GELU()
+
+        def forward(self, x):
+            return self.gelu(x)
+
+    def run_gelu_test(self, inputs):
+        (
+            Tester(self.Gelu(), inputs)
+            .export()
+            .check_count({"torch.ops.aten.gelu.default": 1})
+            .to_edge_transform_and_lower()
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .check_not(["executorch_exir_dialects_edge__ops_aten_gelu_default"])
+            .to_executorch()
+            .serialize()
+            .run_method_and_compare_outputs()
+        )
+
+    def test_fp16_gelu(self):
+        inputs = (torch.randn(20).to(torch.float16),)
+        self.run_gelu_test(inputs)
+
+    def test_fp32_gelu(self):
+        inputs = (torch.randn(20),)
+        self.run_gelu_test(inputs)
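The new tests cover the fp16 and fp32 paths through the full `Tester` pipeline (export, lowering, serialization, and output comparison against eager mode). They run like any other unittest module; the module path below is an assumption based on where the XNNPACK op tests usually live:

```python
# Running the new tests programmatically (module path assumed).
import unittest

suite = unittest.defaultTestLoader.loadTestsFromName(
    "executorch.backends.xnnpack.test.ops.test_gelu.TestGelu"
)
unittest.TextTestRunner(verbosity=2).run(suite)
```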

docs/source/using-executorch-ios.md

Lines changed: 1 addition & 1 deletion

@@ -135,7 +135,7 @@ sudo /Applications/CMake.app/Contents/bin/cmake-gui --install
 For example, the following command will build the ExecuTorch Runtime along with all available kernels and backends for the Apple platform in both Release and Debug modes:

 ```bash
-./scripts/build_apple_frameworks.sh --Release --Debug --coreml --mps --xnnpack --custom --optimized --portable --quantized
+./scripts/build_apple_frameworks.sh
 ```

 After the build finishes successfully, the resulting frameworks can be found in the `cmake-out` directory.

examples/demo-apps/apple_ios/LLaMA/docs/delegates/xnnpack_README.md

Lines changed: 1 addition & 1 deletion

@@ -138,7 +138,7 @@ curl -LO "https://github.com/facebook/buck2/releases/download/${BUCK2_RELEASE_DA
 zstd -cdq "$BUCK2_ARCHIVE" > "$BUCK2" && chmod +x "$BUCK2"
 rm "$BUCK2_ARCHIVE"

-./scripts/build_apple_frameworks.sh --buck2="$(realpath $BUCK2)" --coreml --custom --mps --optimized --portable --quantized --xnnpack
+./scripts/build_apple_frameworks.sh
 ```

 After the build finishes successfully, the resulting frameworks can be found in the `cmake-out` directory. Copy them to your project and link them against your targets.

scripts/build_apple_frameworks.sh

Lines changed: 15 additions & 11 deletions

@@ -9,6 +9,8 @@ set -euxo pipefail

 MODES=("Release" "Debug")
 PRESETS=("ios" "ios-simulator" "macos")
+# To support backwards compatibility, we want to retain the same output directory.
+PRESETS_RELATIVE_OUT_DIR=("ios" "simulator" "macos")

 SOURCE_ROOT_DIR=$(git rev-parse --show-toplevel)
 OUTPUT_DIR="${SOURCE_ROOT_DIR}/cmake-out"
@@ -146,20 +148,22 @@ done
 echo "Building libraries"

 rm -rf "${OUTPUT_DIR}"
-for preset in "${PRESETS[@]}"; do
+for preset_index in "${!PRESETS[@]}"; do
+  preset="${PRESETS[$preset_index]}"
+  preset_output_dir="${OUTPUT_DIR}/${PRESETS_RELATIVE_OUT_DIR[$preset_index]}"
+
   for mode in "${MODES[@]}"; do
-    output_dir="${OUTPUT_DIR}/${preset}"
-    echo "Building preset ${preset} (${mode}) in ${output_dir}..."
+    echo "Building preset ${preset} (${mode}) in ${preset_output_dir}..."

     # Do NOT add options here. Update the respective presets instead.
     cmake -S "${SOURCE_ROOT_DIR}" \
-      -B "${output_dir}" \
-      -DCMAKE_ARCHIVE_OUTPUT_DIRECTORY="${output_dir}" \
+      -B "${preset_output_dir}" \
+      -DCMAKE_ARCHIVE_OUTPUT_DIRECTORY="${preset_output_dir}" \
       -DCMAKE_BUILD_TYPE="${mode}" \
       ${CMAKE_OPTIONS_OVERRIDE[@]:-} \
       --preset "${preset}"

-    cmake --build "${output_dir}" \
+    cmake --build "${preset_output_dir}" \
       --config "${mode}" \
       -j$(sysctl -n hw.ncpu)
   done
@@ -224,9 +228,9 @@ append_framework_flag() {

 for mode in "${MODES[@]}"; do
   FRAMEWORK_FLAGS=()
-  for preset in "${PRESETS[@]}"; do
-    echo "Framework directory: ${preset}/${mode}"
-    FRAMEWORK_FLAGS+=("--directory=${preset}/${mode}")
+  for preset_out_dir in "${PRESETS_RELATIVE_OUT_DIR[@]}"; do
+    echo "Framework directory: ${preset_out_dir}/${mode}"
+    FRAMEWORK_FLAGS+=("--directory=${preset_out_dir}/${mode}")
   done

   append_framework_flag "" "$FRAMEWORK_EXECUTORCH" "$mode"
@@ -245,8 +249,8 @@ done

 echo "Cleaning up"

-for preset in "${PRESETS[@]}"; do
-  rm -rf "${OUTPUT_DIR}/${preset}/$preset"
+for preset_out_dir in "${PRESETS_RELATIVE_OUT_DIR[@]}"; do
+  rm -rf "${OUTPUT_DIR}/${preset_out_dir}"
 done

 rm -rf "$HEADERS_ABSOLUTE_PATH"
