
Commit 1cd1d1d

kick CI, SEV seems to have stopped affecting executorch
[ghstack-poisoned]
2 parents: 3a2516a + b5a6362


43 files changed: +660 −288 lines

.ci/scripts/utils.sh

Lines changed: 1 addition & 2 deletions
@@ -158,8 +158,7 @@ build_executorch_runner() {
 cmake_install_executorch_lib() {
   echo "Installing libexecutorch.a and libportable_kernels.a"
   clean_executorch_install_folders
-  retry cmake -DBUCK2="$BUCK" \
-      -DCMAKE_INSTALL_PREFIX=cmake-out \
+  retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
       -DCMAKE_BUILD_TYPE=Release \
       -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
       -Bcmake-out .

backends/vulkan/_passes/fuse_quantized_ops.py

Lines changed: 4 additions & 1 deletion
@@ -17,6 +17,7 @@
 from executorch.exir import ExportedProgram
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.passes import dead_code_elimination_pass

 #################
 ## linear_qcnw ##
@@ -224,6 +225,8 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
             )

         graph_module.recompile()
-        graph_module = super().call(graph_module).graph_module
+        dead_code_elimination_pass(graph_module)

+        # Re-trace the graph since new nodes were (potentially) inserted
+        graph_module = super().call(graph_module).graph_module
         return PassResult(graph_module, True)
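
The change above runs dead-code elimination before re-tracing instead of re-tracing alone. A minimal sketch of that ordering, assuming a pass derived from ExportPass; the class name and the fusion body are placeholders, only the recompile / DCE / re-trace sequence comes from the diff:

import torch
from executorch.exir.pass_base import ExportPass, PassResult
from executorch.exir.passes import dead_code_elimination_pass


class FuseQuantizedOpsSketch(ExportPass):  # hypothetical name, not the file's class
    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        # ... fuse matching quantized patterns, inserting replacement nodes ...
        graph_module.recompile()
        # Drop the original nodes orphaned by the fusion.
        dead_code_elimination_pass(graph_module)
        # Re-trace the graph since new nodes were (potentially) inserted.
        graph_module = super().call(graph_module).graph_module
        return PassResult(graph_module, True)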

backends/vulkan/_passes/tag_memory_meta_pass.py

Lines changed: 2 additions & 2 deletions
@@ -5,7 +5,6 @@
 # LICENSE file in the root directory of this source tree.

 import logging
-from copy import deepcopy
 from typing import Any, Optional, Set

 import executorch.backends.vulkan.utils as utils
@@ -22,6 +21,7 @@
 from executorch.exir.dialects._ops import ops as exir_ops

 from executorch.exir.pass_base import ExportPass, PassResult
+from executorch.exir.tensor import TensorSpec

 logger: logging.Logger = logging.getLogger("")
 logger.setLevel(logging.INFO)
@@ -52,7 +52,7 @@ def insert_transition_node(
         (arg,),
     )
     clone_node.meta["val"] = arg.meta["val"]
-    clone_node.meta["spec"] = deepcopy(arg.meta["spec"])
+    clone_node.meta["spec"] = TensorSpec.from_tensor(clone_node.meta["val"])
     clone_node.meta["spec"].const = False
     set_memory_metadata(clone_node, storage, layout)
     arg.replace_all_uses_with(clone_node, lambda x, y=node: x == y)
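
For context, the transition node's spec is now rebuilt from its fake-tensor value rather than deep-copied from the producer node. A rough sketch of the resulting pattern; the helper name is illustrative and not part of the commit:

from executorch.exir.tensor import TensorSpec


def init_transition_node_meta(clone_node, arg):  # hypothetical helper
    clone_node.meta["val"] = arg.meta["val"]
    # Derive a fresh spec from the node's FakeTensor value (as in the diff above)
    # instead of deep-copying the producer's spec.
    spec = TensorSpec.from_tensor(clone_node.meta["val"])
    spec.const = False
    clone_node.meta["spec"] = spec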

backends/vulkan/op_registry.py

Lines changed: 25 additions & 8 deletions
@@ -230,6 +230,14 @@ def update_features_impl(op: OpKey):
         exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
         # Symbolic integer ops
         torch.ops.aten.sym_size.int,
+        operator.add,
+        operator.lt,
+        operator.gt,
+        operator.ge,
+        operator.le,
+        # Guard and assert ops
+        torch.ops.aten._assert_scalar.default,
+        torch.ops.aten.sym_constrain_range_for_size.default,
     ]
 )
 def register_ephemeral_op(features: OpFeatures):
@@ -500,7 +508,12 @@ def register_sdpa_with_kv_cache_op(features: OpFeatures):
     return features


-@update_features(["llama::update_cache", "llama::custom_sdpa"])
+@update_features(
+    [
+        "llama::update_cache",
+        "llama::custom_sdpa",
+    ]
+)
 def register_sdpa_ops(features: OpFeatures):
     features.resize_fn = False
     features.buffer_impl = False
@@ -520,8 +533,17 @@ def register_rotary_emb_op(features: OpFeatures):
     return features


-@update_features(exir_ops.edge.aten.view_copy.default)
-def register_view_op(features: OpFeatures):
+@update_features(
+    [
+        exir_ops.edge.aten.clone.default,
+        exir_ops.edge.aten.permute.default,
+        exir_ops.edge.aten.permute_copy.default,
+        exir_ops.edge.aten.select_copy.int,
+        exir_ops.edge.aten.slice_copy.Tensor,
+        exir_ops.edge.aten.view_copy.default,
+    ]
+)
+def register_view_ops(features: OpFeatures):
     features.texture_impl = TextureImplFeatures(
         valid_packed_dims=all_packed_dims,
     )
@@ -538,10 +560,8 @@ def register_view_op(features: OpFeatures):
         # Indexing and lookup
         exir_ops.edge.aten.flip.default,
         exir_ops.edge.aten.index_select.default,
-        exir_ops.edge.aten.select_copy.int,
         # Tensor creation
         exir_ops.edge.aten.arange.start_step,
-        exir_ops.edge.aten.clone.default,
         exir_ops.edge.aten.constant_pad_nd.default,
         exir_ops.edge.aten.full.default,
         exir_ops.edge.aten.full_like.default,
@@ -564,12 +584,9 @@ def register_ported_op(features: OpFeatures):
 # Ops ported from PyTorch Vulkan backend. These ops are in a separate registry becasue they support all packed dimensions
 @update_features(
     [
-        # Indexing and lookup
-        exir_ops.edge.aten.slice_copy.Tensor,
         # Shape Manipulation
         exir_ops.edge.aten.squeeze_copy.dims,
         exir_ops.edge.aten.unsqueeze_copy.default,
-        exir_ops.edge.aten.permute_copy.default,
         # Tensor combination
         exir_ops.edge.aten.cat.default,
         exir_ops.edge.aten.repeat.default,
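
As a quick illustration of the registration pattern this file uses, a single feature function is attached to one op or a list of ops via @update_features. The sketch below would live alongside the existing registrations in op_registry.py; the op list and function name are arbitrary examples, not part of the commit:

@update_features(
    [
        exir_ops.edge.aten.view_copy.default,
        exir_ops.edge.aten.permute_copy.default,
    ]
)
def register_example_ops(features: OpFeatures):
    # update_features, OpFeatures, TextureImplFeatures, and all_packed_dims are
    # defined earlier in op_registry.py; exir_ops comes from the module's imports.
    features.texture_impl = TextureImplFeatures(
        valid_packed_dims=all_packed_dims,
    )
    return features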

backends/vulkan/partitioner/vulkan_partitioner.py

Lines changed: 4 additions & 3 deletions
@@ -146,10 +146,11 @@ def op_node_is_compatible(  # noqa: C901: Function is too complex
     def node_is_compatible(
         self, node: torch.fx.Node, features: Optional[OpFeatures] = None
     ) -> Tuple[bool, str]:
-        if utils.is_symint_node(node):
-            return node.target in vulkan_supported_ops, "Op is compatible"
-        elif utils.is_tensor_node(node):
+        if utils.is_tensor_node(node):
             return self.op_node_is_compatible(node, features=features)
+        # For non-tensor nodes, just check if the op is registered
+        elif hasattr(node, "target"):
+            return node.target in vulkan_supported_ops, "Op is compatible"

         return False, f"Unsupported node type: {node.format_node()}"
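
This ties into the op_registry.py change above: SymInt arithmetic and guard/assert nodes produce no tensor output, so they now fall through to the registry-membership branch. A small illustrative lookup, not part of the commit, with an assumed import path:

import operator

import torch
from executorch.backends.vulkan.op_registry import vulkan_supported_ops  # assumed path

for target in (
    operator.add,  # SymInt arithmetic emitted by dynamic shapes
    torch.ops.aten._assert_scalar.default,  # guard/assert op
):
    # Mirrors the new `node.target in vulkan_supported_ops` check.
    print(target, target in vulkan_supported_ops)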

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
+#define T ${buffer_scalar_type(DTYPE)}
+
+${define_active_storage_type(STORAGE)}
+
+#include "indexing_utils.h"
+
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_tensor(0, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(1, "r", "t_in", DTYPE, STORAGE)}
+$if STORAGE == "buffer":
+  ${layout_declare_ubo(2, "int", "numel")}
+$else:
+  ${layout_declare_ubo(2, "ivec3", "out_limits")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+#include "activations.h"
+
+#ifdef USING_BUFFER
+
+void main() {
+  const int i = int(gl_GlobalInvocationID.x);
+  if (i >= numel) {
+    return;
+  }
+
+  float in_val = float(t_in[i]);
+  t_out[i] = T(tan(in_val));
+}
+
+#else
+
+void main() {
+  const ivec3 pos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(pos, out_limits))) {
+    return;
+  }
+
+  VEC4_T in_texel = texelFetch(t_in, pos, 0);
+  imageStore(t_out, pos, VEC4_T(tan(in_texel)));
+}
+
+#endif
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+tan:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: texture3d
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+    STORAGE:
+      - VALUE: texture3d
+      - VALUE: buffer
+  shader_variants:
+    - NAME: tan
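
The two new files above add a tan compute shader (buffer and texture3d variants) and its codegen YAML. For reference, the elementwise behavior both variants compute is plain aten.tan; the standalone check below is illustrative only and not part of the commit:

import torch

x = torch.linspace(-1.0, 1.0, steps=8)
# Each output element is tan(input element), matching t_out[i] = T(tan(in_val))
# in the buffer variant and VEC4_T(tan(in_texel)) in the texture variant.
expected = torch.tan(x)
assert torch.allclose(expected, x.tan())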

backends/vulkan/runtime/graph/ops/impl/Permute.cpp

Lines changed: 83 additions & 37 deletions
@@ -25,10 +25,12 @@ using utils::uvec4;
 namespace {

 void check_args(
-    const api::vTensor& in,
-    const std::vector<int64_t>& permute_dims,
-    const api::vTensor& out) {
-  VK_CHECK_COND(check_same_packed_dim(in, out));
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef permute_dims,
+    const ValueRef out) {
+  (void)permute_dims;
+  VK_CHECK_COND(check_same_packed_dim(graph, in, out));

   // This implementation doesn't not requires the input tensor to have the same
   // dim size as the argument. The code will work as long as the input tensor's
@@ -38,40 +40,94 @@ void check_args(

 } // namespace

+void resize_permute_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& resize_args) {
+  const ValueRef out = args[0].refs[0];
+  const ValueRef in = args[1].refs[0];
+
+  const std::vector<int64_t> in_sizes = graph->sizes_of(in);
+  const std::vector<int64_t> out_sizes = graph->sizes_of(out);
+
+  const std::vector<int64_t> permute_dims =
+      graph->extract_int_or_symint_list(resize_args[0]);
+
+  if (in_sizes.size() == out_sizes.size() &&
+      in_sizes.size() == permute_dims.size()) {
+    std::vector<int64_t> new_out_sizes(out_sizes.size(), 1);
+    const int64_t out_ndim = std::max(in_sizes.size(), out_sizes.size());
+    for (int i = 0; i < out_ndim; i++) {
+      const int64_t permute_dim = permute_dims.at(i);
+      new_out_sizes.at(i) = in_sizes.at(permute_dim);
+    }
+    graph->virtual_resize(out, new_out_sizes);
+  }
+  // Case where permute is being used to implement squeeze
+  else if (
+      in_sizes.size() > out_sizes.size() &&
+      in_sizes.size() == permute_dims.size()) {
+    std::vector<int64_t> new_out_sizes(out_sizes.size(), 1);
+    const size_t offset = in_sizes.size() - out_sizes.size();
+    for (int i = 0; i < out_sizes.size(); i++) {
+      const int64_t permute_dim = permute_dims.at(i + offset);
+      new_out_sizes.at(i) = in_sizes.at(permute_dim);
+    }
+    graph->virtual_resize(out, new_out_sizes);
+  }
+  // Case where Permute is being used to implement unsqueeze
+  else if (
+      in_sizes.size() < out_sizes.size() &&
+      out_sizes.size() == permute_dims.size()) {
+    std::vector<int64_t> new_out_sizes(out_sizes.size(), 1);
+    const size_t offset = out_sizes.size() - in_sizes.size();
+    for (int i = 0; i < out_sizes.size(); i++) {
+      int64_t permute_dim = permute_dims.at(i) - offset;
+      if (permute_dim >= 0) {
+        new_out_sizes.at(i) = in_sizes.at(permute_dim);
+      }
+    }
+    graph->virtual_resize(out, new_out_sizes);
+  } else {
+    VK_THROW("Invalid permute dims");
+  }
+}
+
 void add_permute_node(
     ComputeGraph& graph,
-    ValueRef in,
-    const std::vector<int64_t>& permute_dims,
-    ValueRef out) {
-  vTensorPtr t_in = graph.get_tensor(in);
-  vTensorPtr t_out = graph.get_tensor(out);
-
-  check_args(*t_in, permute_dims, *t_out);
+    const ValueRef in,
+    const ValueRef permute_dims,
+    const ValueRef out) {
+  check_args(graph, in, permute_dims, out);

   ivec4 out_dims{0, 1, 2, 3};

   // Special cases of squeeze/unsqueeze. Because the input dim size can be
-  // different with output dim size. So pick t_in->dim() if squeeze, and
-  // t_out->dim() if unsqueeze to create parameter for permute.
-  int64_t out_ndim = std::max(t_in->dim(), t_out->dim());
+  // different with output dim size. So pick graph.dim_of(in) if squeeze, and
+  // graph.dim_of(out) if unsqueeze to create parameter for permute.
+  const int64_t out_ndim = std::max(graph.dim_of(in), graph.dim_of(out));
   std::vector<bool> seen(out_ndim);
-  for (int i = 0; i < out_ndim; i++) {
-    int64_t permute_dim = permute_dims[i];
-    VK_CHECK_COND(
-        !seen[permute_dim], "Argument dim ", permute_dim, " is repeated");
-    seen[permute_dim] = true;
-
-    out_dims[(4u - out_ndim) + i] = permute_dim + (4 - out_ndim);
+  {
+    IntListPtr permute_dims_ptr = graph.get_int_list(permute_dims);
+    for (int i = 0; i < out_ndim; i++) {
+      int64_t permute_dim = permute_dims_ptr->at(i);
+      VK_CHECK_COND(
+          !seen[permute_dim], "Argument dim ", permute_dim, " is repeated");
+      seen[permute_dim] = true;
+
+      out_dims[(4u - out_ndim) + i] =
+          utils::safe_downcast<int32_t>(permute_dim + (4 - out_ndim));
+    }
   }

   std::string kernel_name = "permute";
   kernel_name.reserve(kShaderNameReserve);
-  add_dtype_suffix(kernel_name, *t_out);
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));

-  int32_t out_channels = dim_at<kChannel4D>(t_out->sizes());
-  int32_t in_channels = dim_at<kChannel4D>(t_in->sizes());
+  const int32_t out_channels = dim_at<kChannel4D>(graph.sizes_of(out));
+  const int32_t in_channels = dim_at<kChannel4D>(graph.sizes_of(in));

-  const auto packed_dim = graph.packed_dim_of(in);
+  const int32_t packed_dim = graph.packed_dim_of(in);
   ivec2 channel_info = {out_channels, in_channels};
   if (packed_dim == WHCN::kChannelsDim) {
     channel_info[0] = utils::align_up_4(channel_info[0]);
@@ -95,19 +151,9 @@ void add_permute_node(
       // Specialization Constants
       spec_vars,
       // Resize Args
-      {},
+      {permute_dims},
       // Resizing Logic
-      nullptr));
-}
-
-void add_permute_node(
-    ComputeGraph& graph,
-    ValueRef in,
-    ValueRef permute_dims_ref,
-    ValueRef out) {
-  IntListPtr permute_dims = graph.get_int_list(permute_dims_ref);
-
-  add_permute_node(graph, in, *permute_dims, out);
+      resize_permute_node));
 }

 void permute(ComputeGraph& graph, const std::vector<ValueRef>& args) {
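
resize_permute_node recomputes the output sizes at resize time, including the cases where permute stands in for squeeze or unsqueeze. A Python mirror of that size computation, shown only to make the three branches concrete and not part of the commit:

from typing import List


def permuted_out_sizes(
    in_sizes: List[int], permute_dims: List[int], out_rank: int
) -> List[int]:
    in_rank = len(in_sizes)
    new_out_sizes = [1] * out_rank
    if in_rank == out_rank == len(permute_dims):
        # Plain permute: out[i] = in[permute_dims[i]]
        for i in range(out_rank):
            new_out_sizes[i] = in_sizes[permute_dims[i]]
    elif in_rank > out_rank and in_rank == len(permute_dims):
        # Permute used as squeeze: leading dims are dropped
        offset = in_rank - out_rank
        for i in range(out_rank):
            new_out_sizes[i] = in_sizes[permute_dims[i + offset]]
    elif in_rank < out_rank and out_rank == len(permute_dims):
        # Permute used as unsqueeze: leading dims become size 1
        offset = out_rank - in_rank
        for i in range(out_rank):
            d = permute_dims[i] - offset
            if d >= 0:
                new_out_sizes[i] = in_sizes[d]
    else:
        raise ValueError("Invalid permute dims")
    return new_out_sizes


# e.g. permuting a (2, 3, 4) tensor with dims (1, 2, 0) yields shape (3, 4, 2)
assert permuted_out_sizes([2, 3, 4], [1, 2, 0], out_rank=3) == [3, 4, 2]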

backends/vulkan/runtime/graph/ops/impl/Permute.h

Lines changed: 3 additions & 3 deletions
@@ -18,8 +18,8 @@ namespace vkcompute {

 void add_permute_node(
     ComputeGraph& graph,
-    ValueRef in,
-    const std::vector<int64_t>& permute_dims,
-    ValueRef out);
+    const ValueRef in,
+    const ValueRef permute_dims,
+    const ValueRef out);

 } // namespace vkcompute
