
Commit cbd4958
Update on "[ExecuTorch] Simplify function pointers for apply_ternary_elementwise_fn"
Cleaning up some of the required boilerplate. I updated op_clamp and op_where, but left op_where unoptimized for size/build time; ideal usage that optimizes for size/build time looks like op_clamp.

Differential Revision: [D63790004](https://our.internmc.facebook.com/intern/diff/D63790004/)

[ghstack-poisoned]
2 parents: b8380d5 + 85ceab0
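The headline change is the usual function-pointer-to-functor cleanup: instead of threading several raw function pointers through the elementwise helper, callers pass a single templated lambda. A rough sketch of the pattern only — simplified, hypothetical signatures, not the actual ExecuTorch internals:

#include <cstddef>

// Sketch: the helper takes one templated functor instead of separate
// raw function pointers (hypothetical signature, for illustration only).
template <typename CTYPE, typename Op>
void apply_ternary_elementwise_fn(
    const Op& compute_fun,
    const CTYPE* a,
    const CTYPE* b,
    const CTYPE* c,
    CTYPE* out,
    size_t numel) {
  for (size_t i = 0; i < numel; ++i) {
    out[i] = compute_fun(a[i], b[i], c[i]);
  }
}

// A clamp-style caller then supplies a single lambda:
//   apply_ternary_elementwise_fn<float>(
//       [](float v, float lo, float hi) {
//         return v < lo ? lo : (v > hi ? hi : v);
//       },
//       in, lo_buf, hi_buf, out, numel);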

File tree: 74 files changed (+2296 / -1617 lines)


.ci/docker/ci_commit_pins/pytorch.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-aec9b2ab77389967ef39bb9c10662fd0fe3e185a
+5ba404f68775bb06a1125a100687f86b6d6de6a8

.github/workflows/android-perf.yml

Lines changed: 3 additions & 7 deletions
@@ -176,8 +176,8 @@ jobs:
           fi
           echo "::endgroup::"
 
-  build-llm-demo:
-    name: build-llm-demo
+  build-benchmark-app:
+    name: build-benchmark-app
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
     needs: set-parameters
     with:
@@ -211,7 +211,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
     needs:
       - set-parameters
-      - build-llm-demo
+      - build-benchmark-app
      - export-models
     strategy:
       matrix:
@@ -228,10 +228,6 @@
       # This is the ARN of ExecuTorch project on AWS
       project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6
       device-pool-arn: ${{ matrix.device }}
-      # Uploaded to S3 from the previous job, the name of the app comes from the project itself.
-      # Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer.
-      # It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only
-      # one app+flavor that could load and run the model.
       android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk
       android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk
       # NB: Need to set the default spec here so that it works for periodic too

backends/apple/mps/setup.md

Lines changed: 2 additions & 2 deletions
@@ -111,12 +111,12 @@ python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --no-use_fp
 ```
 
 ### Profiling:
-1. [Optional] Generate an [ETRecord](./sdk-etrecord.rst) while you're exporting your model.
+1. [Optional] Generate an [ETRecord](./etrecord.rst) while you're exporting your model.
 ```bash
 cd executorch
 python3 -m examples.apple.mps.scripts.mps_example --model_name="mv3" --generate_etrecord -b
 ```
-2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./sdk-etdump.md).
+2. Run your Program on the ExecuTorch runtime and generate an [ETDump](./etdump.md).
 ```
 ./cmake-out/examples/apple/mps/mps_executor_runner --model_path mv3_mps_bundled_fp16.pte --bundled_program --dump-outputs
 ```

backends/arm/test/common.py

Lines changed: 8 additions & 1 deletion
@@ -6,6 +6,7 @@
 
 import logging
 import os
+import platform
 import shutil
 import subprocess
 import sys
@@ -57,11 +58,17 @@ def pytest_collection_modifyitems(config, items):
 
 
 def load_libquantized_ops_aot_lib():
+    so_ext = {
+        "Darwin": "dylib",
+        "Linux": "so",
+        "Windows": "dll",
+    }.get(platform.system(), None)
+
     find_lib_cmd = [
         "find",
         "cmake-out-aot-lib",
         "-name",
-        "libquantized_ops_aot_lib.so",
+        f"libquantized_ops_aot_lib.{so_ext}",
     ]
     res = subprocess.run(find_lib_cmd, capture_output=True)
     if res.returncode == 0:

backends/arm/test/ops/test_conv_combos.py

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,8 @@
 
 from typing import Tuple
 
+import pytest
+
 import torch
 from executorch.backends.arm.test import common
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
@@ -311,6 +313,8 @@ def test_block_bottleneck_residual_tosa_MI(self):
         model = ComboBlockBottleneckResidual()
         self._test_conv_combo_tosa_MI_pipeline(model, model.get_inputs())
 
+    # TODO: Investigate flakyness (MLTORCH-307)
+    @pytest.mark.flaky(reruns=3)
     def test_block_bottleneck_residual_tosa_BI(self):
         model = ComboBlockBottleneckResidual()
         self._test_conv_combo_tosa_BI_pipeline(model, model.get_inputs())

backends/arm/test/ops/test_split.py

Lines changed: 4 additions & 1 deletion
@@ -124,8 +124,11 @@ def test_split_with_sizes_tosa_MI(self, test_data: test_data_t):
         self._test_split_tosa_MI_pipeline(self.SplitWithSizes(), test_data)
 
     @parameterized.expand(Split.test_data)
-    def test_split_n_out_tosa_MI(self, test_data: test_data_t):
+    def test_split_one_out_tosa_MI(self, test_data: test_data_t):
         self._test_split_tosa_MI_pipeline(self.SplitSingleOut(), test_data)
+
+    @parameterized.expand(Split.test_data)
+    def test_split_two_out_tosa_MI(self, test_data: test_data_t):
         self._test_split_tosa_MI_pipeline(self.SplitTwoOut(), test_data)
 
     @parameterized.expand(Split.test_data)

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 10 additions & 0 deletions
@@ -300,6 +300,11 @@ ValueRef ComputeGraph::add_tensor_view(const ValueRef vref) {
   const vTensorPtr t = get_tensor(vref);
   ValueRef idx(static_cast<int>(values_.size()));
   values_.emplace_back(api::vTensor(*t));
+  for (SharedObject& sobj : shared_objects_) {
+    if (sobj.has_user(vref)) {
+      sobj.add_user(this, idx);
+    }
+  }
   return idx;
 }
 
@@ -311,6 +316,11 @@ ValueRef ComputeGraph::add_tensor_view(
   const vTensorPtr t = get_tensor(vref);
   ValueRef idx(static_cast<int>(values_.size()));
   values_.emplace_back(api::vTensor(*t, sizes, strides, offset_numel));
+  for (SharedObject& sobj : shared_objects_) {
+    if (sobj.has_user(vref)) {
+      sobj.add_user(this, idx);
+    }
+  }
   return idx;
 }
 

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 13 additions & 0 deletions
@@ -378,6 +378,19 @@ class ComputeGraph final {
     return values_.at(idx).toString();
   }
 
+  template <
+      typename T,
+      typename std::enable_if<
+          std::is_integral<T>::value && std::is_signed<T>::value,
+          int>::type = 0>
+  T extract_whcn_dim(const ValueRef idx, const int64_t ndim) {
+    T dim = extract_scalar<T>(idx);
+    // Normalize dim to account for negative indexing
+    dim = (dim % ndim + ndim) % ndim;
+    // Assume original value is NCHW ordering, obtain the WHCN ordering
+    return ndim - 1 - dim;
+  }
+
   //
   // Utility functions
   //
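The normalization in extract_whcn_dim is plain modular arithmetic followed by an index flip. A minimal standalone sketch of the same math, detached from ValueRef/ComputeGraph and shown only for illustration:

#include <cassert>
#include <cstdint>

// Same arithmetic as extract_whcn_dim above, on plain integers.
int64_t whcn_dim(int64_t dim, int64_t ndim) {
  dim = (dim % ndim + ndim) % ndim;  // fold negative dims into [0, ndim)
  return ndim - 1 - dim;             // NCHW index -> WHCN index
}

int main() {
  // 4-D NCHW tensor: N=0, C=1, H=2, W=3.
  assert(whcn_dim(0, 4) == 3);   // N becomes the outermost WHCN dim
  assert(whcn_dim(3, 4) == 0);   // W becomes the innermost WHCN dim
  assert(whcn_dim(-1, 4) == 0);  // -1 normalizes to 3, then flips to 0
  return 0;
}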

backends/vulkan/runtime/graph/containers/SharedObject.cpp

Lines changed: 4 additions & 0 deletions
@@ -12,6 +12,10 @@
 
 namespace vkcompute {
 
+bool SharedObject::has_user(const ValueRef idx) const {
+  return std::find(users.begin(), users.end(), idx) != users.end();
+}
+
 void SharedObject::add_user(ComputeGraph* const graph, const ValueRef idx) {
   vTensorPtr t = graph->get_tensor(idx);
 

backends/vulkan/runtime/graph/containers/SharedObject.h

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ struct SharedObject {
   std::vector<ValueRef> users;
   vkapi::Allocation allocation;
 
+  bool has_user(const ValueRef idx) const;
   void add_user(ComputeGraph* const graph, const ValueRef idx);
   void allocate(ComputeGraph* const graph);
   void bind_users(ComputeGraph* const graph);
Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#version 450 core
+
+#define PRECISION ${PRECISION}
+
+#define VEC4_T ${texel_load_type(DTYPE, STORAGE)}
+
+${define_required_extensions(DTYPE)}
+
+layout(std430) buffer;
+
+${layout_declare_tensor(B, "w", "tout", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "tin", DTYPE, STORAGE)}
+${layout_declare_ubo(B, "ivec3", "tin_limits")}
+${layout_declare_ubo(B, "ivec4", "tin_axis_map")}
+${layout_declare_ubo(B, "ivec4", "tout_axis_map")}
+
+layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
+
+layout(constant_id = 3) const int nrepeats = 1;
+layout(constant_id = 4) const int repeat_dim = 1;
+
+#include "indexing_utils.h"
+
+void main() {
+  const ivec3 tin_lpos = ivec3(gl_GlobalInvocationID);
+
+  if (any(greaterThanEqual(tin_lpos, tin_limits))) {
+    return;
+  }
+
+  const VEC4_T intex = load_texel_lpos(tin, tin_lpos, tin_axis_map);
+
+  ivec3 tout_lpos = tin_lpos;
+  tout_lpos[repeat_dim] *= nrepeats;
+
+  for (int i = 0; i < nrepeats; ++i, tout_lpos[repeat_dim]++) {
+    write_texel_lpos(tout, tout_lpos, intex, tout_axis_map);
+  }
+}
Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+repeat_interleave:
+  parameter_names_with_default_values:
+    DTYPE: float
+    STORAGE: texture3d
+  generate_variant_forall:
+    DTYPE:
+      - VALUE: half
+      - VALUE: float
+  shader_variants:
+    - NAME: repeat_interleave
Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/vulkan/runtime/graph/ops/OperatorRegistry.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/RepeatInterleave.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ops/impl/utils/TensorUtils.h>
+#include <executorch/backends/vulkan/runtime/graph/ops/utils/ShaderNameUtils.h>
+
+namespace vkcompute {
+
+void resize_repeat_interleave_node(
+    ComputeGraph* graph,
+    const std::vector<ArgGroup>& args,
+    const std::vector<ValueRef>& extra_args) {
+  (void)extra_args;
+  vTensorPtr out = graph->get_tensor(args[0].refs[0]);
+  vTensorPtr in = graph->get_tensor(args[1].refs[0]);
+
+  const int64_t nrepeats = graph->extract_scalar<int64_t>(extra_args[0]);
+  int64_t repeat_dim = graph->extract_scalar<int64_t>(extra_args[1]);
+
+  std::vector<int64_t> new_sizes = in->sizes();
+  repeat_dim = normalize(repeat_dim, new_sizes.size());
+  new_sizes.at(repeat_dim) *= nrepeats;
+
+  out->virtual_resize(new_sizes);
+}
+
+void add_repeat_interleave_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef num_repeats,
+    const ValueRef dim,
+    const ValueRef out) {
+  const int32_t nrepeats = graph.extract_scalar<int32_t>(num_repeats);
+  const int32_t repeat_dim =
+      graph.extract_whcn_dim<int32_t>(dim, graph.dim_of(in));
+
+  VK_CHECK_COND(repeat_dim != graph.packed_dim_of(out));
+  VK_CHECK_COND(repeat_dim != graph.packed_dim_of(in));
+
+  std::string kernel_name = "repeat_interleave";
+  add_dtype_suffix(kernel_name, graph.dtype_of(out));
+
+  const utils::uvec3 global_wg_size = graph.logical_limits_of(in);
+  const utils::uvec3 local_wg_size = graph.create_local_wg_size(global_wg_size);
+
+  graph.execute_nodes().emplace_back(new ExecuteNode(
+      graph,
+      // Shader
+      VK_KERNEL_FROM_STR(kernel_name),
+      // Workgroup sizes
+      global_wg_size,
+      local_wg_size,
+      // Inputs and Outputs
+      {{out, vkapi::MemoryAccessType::WRITE},
+       {in, vkapi::MemoryAccessType::READ}},
+      // Parameter buffers
+      {graph.logical_limits_ubo(in),
+       graph.axis_map_ubo(in),
+       graph.axis_map_ubo(out)},
+      // Specialization Constants
+      {nrepeats, repeat_dim},
+      // Resizing Logic
+      resize_repeat_interleave_node,
+      {num_repeats, dim}));
+}
+
+void repeat_interleave(ComputeGraph& graph, const std::vector<ValueRef>& args) {
+  int args_i = 0;
+  const ValueRef in = args[args_i++];
+  const ValueRef num_repeats = args[args_i++];
+  const ValueRef dim = args[args_i++];
+  const ValueRef output_size = args[args_i++];
+  const ValueRef out = args[args_i++];
+
+  // Output size is not used in the kernel
+  (void)output_size;
+
+  add_repeat_interleave_node(graph, in, num_repeats, dim, out);
+}
+
+REGISTER_OPERATORS {
+  VK_REGISTER_OP(aten.repeat_interleave.self_int, repeat_interleave);
+}
+
+} // namespace vkcompute
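The shader and the op above write each input texel nrepeats times along repeat_dim. For reference, the same semantics on a flat array, as a minimal sketch (plain C++, one contiguous dimension assumed; not the Vulkan implementation):

#include <vector>

// Reference semantics of repeat_interleave along one contiguous dimension.
std::vector<float> repeat_interleave_inner(
    const std::vector<float>& in, int nrepeats) {
  std::vector<float> out;
  out.reserve(in.size() * nrepeats);
  for (float v : in) {
    for (int r = 0; r < nrepeats; ++r) {
      out.push_back(v);  // each element appears nrepeats times, in order
    }
  }
  return out;
}

// Example: {1, 2, 3} with nrepeats = 2 -> {1, 1, 2, 2, 3, 3}.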
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/vulkan/runtime/api/api.h>
+
+#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h>
+
+namespace vkcompute {
+
+void add_repeat_interleave_node(
+    ComputeGraph& graph,
+    const ValueRef in,
+    const ValueRef num_repeats,
+    const ValueRef dim,
+    const ValueRef out);
+
+} // namespace vkcompute

backends/vulkan/runtime/vk_api/memory/Buffer.h

Lines changed: 3 additions & 1 deletion
@@ -161,7 +161,9 @@ class VulkanBuffer final {
 
   inline void bind_allocation(const Allocation& memory) {
     VK_CHECK_COND(!memory_, "Cannot bind an already bound allocation!");
-    VK_CHECK(vmaBindBufferMemory(allocator_, memory.allocation, handle_));
+    if (!is_copy_) {
+      VK_CHECK(vmaBindBufferMemory(allocator_, memory.allocation, handle_));
+    }
     memory_.allocation = memory.allocation;
   }
 
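The is_copy_ guard appears to keep a copied buffer from re-binding device memory that its source buffer already bound; the copy only records the allocation handle. A minimal standalone sketch of that ownership split (hypothetical names, not the Vulkan Memory Allocator API):

#include <cassert>

struct Allocation {
  int handle = 0;
};

class Buffer {
 public:
  explicit Buffer(bool is_copy) : is_copy_(is_copy) {}

  void bind_allocation(const Allocation& memory) {
    assert(memory_.handle == 0 && "Cannot bind an already bound allocation!");
    if (!is_copy_) {
      // Only the owning buffer binds device memory; a copy aliases
      // the owner's existing binding.
      bind_device_memory(memory);
    }
    memory_ = memory;  // owner and copy both record the allocation
  }

 private:
  static void bind_device_memory(const Allocation&) {
    // stands in for vmaBindBufferMemory(...) in the real code
  }

  bool is_copy_;
  Allocation memory_;
};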