Commit aea9736

Merge branch 'main' into jz/remove-ckpt-ci
2 parents: 87b9f32 + 07266f9


41 files changed: +2736 −124 lines

.ci/scripts/test_model.sh

Lines changed: 6 additions & 3 deletions
@@ -224,19 +224,22 @@ test_model_with_coreml() {
 
   "${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}" --compute_precision "${DTYPE}"
   EXPORTED_MODEL=$(find "." -type f -name "${MODEL_NAME}*.pte" -print -quit)
-  # TODO:
+
   if [ -n "$EXPORTED_MODEL" ]; then
     EXPORTED_MODEL_WITH_DTYPE="${EXPORTED_MODEL%.pte}_${DTYPE}.pte"
     mv "$EXPORTED_MODEL" "$EXPORTED_MODEL_WITH_DTYPE"
     EXPORTED_MODEL="$EXPORTED_MODEL_WITH_DTYPE"
-    echo "Renamed file path: $EXPORTED_MODEL"
+    echo "OK exported model: $EXPORTED_MODEL"
   else
-    echo "No .pte file found"
+    echo "[error] failed to export model: no .pte file found"
     exit 1
   fi
 
   # Run the model
   if [ "${should_test}" = true ]; then
+    echo "Installing requirements needed to build coreml_executor_runner..."
+    backends/apple/coreml/scripts/install_requirements.sh
+
     echo "Testing exported model with coreml_executor_runner..."
     local out_dir=$(mktemp -d)
     COREML_EXECUTOR_RUNNER_OUT_DIR="${out_dir}" examples/apple/coreml/scripts/build_executor_runner.sh

.ci/scripts/wheel/test_macos.py

Lines changed: 4 additions & 6 deletions
@@ -15,11 +15,9 @@
             model=Model.Mv3,
             backend=Backend.XnnpackQuantizationDelegation,
         ),
-        # Enable this once CoreML is suppported out-of-the-box
-        # https://github.com/pytorch/executorch/issues/9019
-        # test_base.ModelTest(
-        #     model=Model.Mv3,
-        #     backend=Backend.CoreMlTest,
-        # )
+        test_base.ModelTest(
+            model=Model.Mv3,
+            backend=Backend.CoreMlTest,
+        ),
     ]
 )

.github/workflows/build-wheels-linux.yml

Lines changed: 3 additions & 0 deletions
@@ -6,6 +6,9 @@ on:
     paths:
       - .ci/**/*
       - .github/workflows/build-wheels-linux.yml
+      - examples/**/*
+      - pyproject.toml
+      - setup.py
   push:
     branches:
       - nightly

.github/workflows/build-wheels-macos.yml

Lines changed: 6 additions & 1 deletion
@@ -6,6 +6,9 @@ on:
     paths:
       - .ci/**/*
       - .github/workflows/build-wheels-macos.yml
+      - examples/**/*
+      - pyproject.toml
+      - setup.py
   push:
     branches:
       - nightly
@@ -57,6 +60,8 @@ jobs:
       pre-script: ${{ matrix.pre-script }}
       post-script: ${{ matrix.post-script }}
       package-name: ${{ matrix.package-name }}
-      runner-type: macos-m1-stable
+      # Meta's macOS runners do not have Xcode, so use GitHub's runners.
+      runner-type: macos-latest-xlarge
+      setup-miniconda: true
       smoke-test-script: ${{ matrix.smoke-test-script }}
       trigger-event: ${{ github.event_name }}

.github/workflows/trunk.yml

Lines changed: 10 additions & 4 deletions
@@ -65,22 +65,29 @@ jobs:
       matrix:
         model: [linear, add, add_mul, ic3, ic4, mv2, mv3, resnet18, resnet50, vit, w2l, mobilebert, emformer_join, emformer_transcribe]
         backend: [portable, xnnpack-quantization-delegation]
+        runner: [linux.arm64.2xlarge]
         include:
           - model: lstm
             backend: portable
+            runner: linux.arm64.2xlarge
           - model: mul
             backend: portable
+            runner: linux.arm64.2xlarge
           - model: softmax
             backend: portable
+            runner: linux.arm64.2xlarge
           - model: phi_4_mini
             backend: portable
+            runner: linux.arm64.m7g.4xlarge
           - model: qwen2_5
             backend: portable
+            runner: linux.arm64.2xlarge
           - model: llama3_2_vision_encoder
             backend: portable
+            runner: linux.arm64.2xlarge
       fail-fast: false
     with:
-      runner: linux.arm64.2xlarge
+      runner: ${{ matrix.runner }}
       docker-image: executorch-ubuntu-22.04-gcc11-aarch64
       submodules: 'true'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
@@ -536,9 +543,8 @@ jobs:
           git clone https://github.com/huggingface/optimum-executorch
           cd optimum-executorch
           # There is no release yet, for CI stability, always test from the same commit on main
-          git checkout 6a7e83f3eee2976fa809335bfb78a45b1ea1cb25
-          pip install .
-          pip install accelerate sentencepiece
+          git checkout 577a2b19670e4c643a5c6ecb09bf47b9a699e7c6
+          pip install .[tests]
           pip list
           echo "::endgroup::"

backends/arm/test/conftest.py

Lines changed: 4 additions & 1 deletion
@@ -44,7 +44,10 @@ def pytest_configure(config):
         )
     # Only enable if we also have the TOSA reference model available.
     pytest._test_options["corstone_fvp"] = True  # type: ignore[attr-defined]
-    pytest._test_options["llama_inputs"] = config.option.llama_inputs  # type: ignore[attr-defined]
+
+    if getattr(config.option, "llama_inputs", False) and config.option.llama_inputs:
+        pytest._test_options["llama_inputs"] = config.option.llama_inputs  # type: ignore[attr-defined]
+
     pytest._test_options["fast_fvp"] = False  # type: ignore[attr-defined]
     if getattr(config.option, "fast_fvp", False):
         pytest._test_options["fast_fvp"] = config.option.fast_fvp  # type: ignore[attr-defined]

backends/arm/test/misc/test_debug_feats.py

Lines changed: 2 additions & 2 deletions
@@ -197,10 +197,10 @@ def test_collate_tosa_BI_tests(self):
             "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests"
         )
         assert os.path.exists(
-            "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/output_tag6.tosa"
+            "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/output_tag6_TOSA-0.80+BI.tosa"
         )
         assert os.path.exists(
-            "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/desc_tag6.json"
+            "test_collate_tosa_tests/tosa-bi/TestCollateTosaTests/test_collate_tosa_BI_tests/desc_tag6_TOSA-0.80+BI.json"
         )
 
         os.environ.pop("TOSA_TESTCASES_BASE_PATH")

backends/arm/tosa_backend.py

Lines changed: 1 addition & 1 deletion
@@ -125,7 +125,7 @@ def preprocess(  # noqa: C901
             dbg_tosa_dump(
                 tosa_graph,
                 artifact_path,
-                suffix="{}".format(f"_{tag}" if tag else ""),
+                suffix="{}".format(f"_{tag}" if tag else "") + (f"_{tosa_spec}"),
             )
 
     # Serialize and return the TOSA flatbuffer.

backends/vulkan/runtime/api/Context.cpp

Lines changed: 8 additions & 5 deletions
@@ -272,7 +272,7 @@ Context* context() {
 
 VkPipeline Context::get_shader_pipeline(
     const vkapi::ShaderInfo& shader,
-    const vkapi::SpecVarList& spec_constants) {
+    const vkapi::SpecVarList& additional_constants) {
   const uint32_t push_constants_size = 128u;
 
   VkDescriptorSetLayout shader_layout =
@@ -281,12 +281,15 @@ VkPipeline Context::get_shader_pipeline(
       pipeline_layout_cache().retrieve(shader_layout, push_constants_size);
 
   const utils::WorkgroupSize local_workgroup_size(4u, 4u, 1u);
+  vkapi::SpecVarList spec_constants = {
+      SV(local_workgroup_size[0u]),
+      SV(local_workgroup_size[1u]),
+      SV(local_workgroup_size[2u])};
+
+  spec_constants.append(additional_constants);
 
   VkPipeline pipeline = pipeline_cache().retrieve(
-      {pipeline_layout,
-       shader_cache().retrieve(shader),
-       spec_constants,
-       local_workgroup_size});
+      {pipeline_layout, shader_cache().retrieve(shader), spec_constants});
 
   return pipeline;
 }
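Note on the Context.cpp change: the fixed 4×4×1 local workgroup size is now emitted as the first three specialization constants (consumed by the `layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;` qualifiers in the shaders below), and the workgroup size no longer forms part of the pipeline-cache key. A minimal standalone sketch of that prepend-then-append layout, using a plain std::vector in place of vkapi::SpecVarList (illustrative only, not the commit's API):

// Standalone illustration of the spec-constant layout introduced above.
// A plain std::vector stands in for vkapi::SpecVarList; values are the
// constant data only, keyed by their position (constant ID).
#include <cstdint>
#include <vector>

std::vector<uint32_t> build_spec_constants(
    const std::vector<uint32_t>& additional_constants) {
  // Constant IDs 0..2 always carry the 4x4x1 local workgroup size.
  std::vector<uint32_t> spec_constants = {4u, 4u, 1u};
  // Shader-specific constants are appended after the workgroup size,
  // so callers now pass only their own constants.
  spec_constants.insert(
      spec_constants.end(),
      additional_constants.begin(),
      additional_constants.end());
  return spec_constants;
}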

backends/vulkan/runtime/graph/ops/glsl/clone.glsl

Lines changed: 6 additions & 7 deletions
@@ -8,16 +8,15 @@
 
 #version 450 core
 
+#include "indexing_utils.h"
+
 #define PRECISION ${PRECISION}
 
 layout(std430) buffer;
 
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 2) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
+${layout_declare_ubo(B, "ivec3", "out_limits")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -26,5 +25,5 @@ void main() {
   if (any(greaterThanEqual(pos, out_limits))) {
     return;
   }
-  imageStore(image_out, pos, texelFetch(image_in, pos, 0));
+  imageStore(t_out, pos, load_texel(t_in, pos));
 }

backends/vulkan/runtime/graph/ops/glsl/clone.yaml

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@ clone:
   parameter_names_with_default_values:
     DTYPE: float
     NDIM: 3
+    STORAGE: texture3d
   generate_variant_forall:
     DTYPE:
       - VALUE: half

backends/vulkan/runtime/graph/ops/glsl/full.glsl

Lines changed: 4 additions & 10 deletions
@@ -18,15 +18,9 @@
 
 layout(std430) buffer;
 
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-
-layout(set = 0, binding = 1) uniform PRECISION restrict Sizes {
-  ivec4 sizes;
-};
-
-layout(set = 0, binding = 2) uniform PRECISION restrict FillVal {
-  float fill_value;
-};
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_ubo(B, "ivec4", "sizes")}
+${layout_declare_ubo(B, "float", "fill_value")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -50,5 +44,5 @@ void main() {
     outtex = outtex * valid_idx;
   }
 
-  imageStore(image_out, POS, outtex);
+  imageStore(t_out, POS, outtex);
 }

backends/vulkan/runtime/graph/ops/glsl/full.yaml

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@ full:
     NDIM: 3
     DTYPE: float
     PACKING: C_packed
+    STORAGE: texture3d
   generate_variant_forall:
     DTYPE:
       - VALUE: half

backends/vulkan/runtime/graph/ops/glsl/max_pool2d.glsl

Lines changed: 9 additions & 21 deletions
@@ -15,24 +15,12 @@
 
 layout(std430) buffer;
 
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1, ${IMAGE_FORMAT["int"]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM]["int"]} image_idx;
-layout(set = 0, binding = 2) uniform PRECISION sampler3D image_in;
-
-layout(set = 0, binding = 3) uniform PRECISION restrict OutLimits {
-  ivec3 out_limits;
-};
-
-layout(set = 0, binding = 4) uniform PRECISION restrict InSizes {
-  ivec4 in_sizes;
-};
-
-layout(set = 0, binding = 5) uniform PRECISION restrict Params {
-  ivec2 kernel_size;
-  ivec2 stride;
-  ivec2 padding;
-  ivec2 dilation;
-};
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "w", "t_idx", "int", STORAGE)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
+${layout_declare_ubo(B, "ivec3", "out_limits")}
+${layout_declare_ubo(B, "ivec4", "in_sizes")}
+${layout_declare_ubo(B, "ivec2", "kernel_size", "ivec2", "stride", "ivec2", "padding", "ivec2", "dilation")}
 
 layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in;
 
@@ -54,7 +42,7 @@ void main() {
   for (int y = start.y; y < end.y; y += dilation.y) {
     for (int x = start.x; x < end.x; x += dilation.x) {
       if ((x >= 0 && x < in_sizes.x) && (y >= 0 && y < in_sizes.y)) {
-        const vec4 cur_texel = texelFetch(image_in, ivec3(x, y, pos.z), 0);
+        const vec4 cur_texel = load_texel(t_in, ivec3(x, y, pos.z));
 
         // Set idx if value is greatest in the pool; else, keep the existing idx.
         ivec4 cur_idx = ivec4(x + int(in_sizes.x) * y);
@@ -66,6 +54,6 @@ void main() {
     }
   }
 
-  imageStore(image_out, pos, out_texel);
-  imageStore(image_idx, pos, idx_texel);
+  imageStore(t_out, pos, out_texel);
+  imageStore(t_idx, pos, idx_texel);
 }

backends/vulkan/runtime/graph/ops/glsl/max_pool2d.yaml

Lines changed: 1 addition & 0 deletions
@@ -8,6 +8,7 @@ max_pool2d:
   parameter_names_with_default_values:
     NDIM: 3
    DTYPE: float
+    STORAGE: texture3d
   generate_variant_forall:
     DTYPE:
       - VALUE: half

backends/vulkan/runtime/graph/ops/glsl/permute.glsl

Lines changed: 4 additions & 4 deletions
@@ -16,8 +16,8 @@ layout(std430) buffer;
 
 #include "indexing_utils.h"
 
-layout(set = 0, binding = 0, ${IMAGE_FORMAT[DTYPE]}) uniform PRECISION restrict writeonly ${IMAGE_T[NDIM][DTYPE]} image_out;
-layout(set = 0, binding = 1) uniform PRECISION ${SAMPLER_T[NDIM][DTYPE]} image_in;
+${layout_declare_tensor(B, "w", "t_out", DTYPE, STORAGE)}
+${layout_declare_tensor(B, "r", "t_in", DTYPE, STORAGE)}
 
 layout(push_constant) uniform PRECISION restrict Block {
   ivec4 out_limits;
@@ -72,7 +72,7 @@ void main() {
     fetch_pos[packed_dim] >>= 2;
 
     // fetch input texel
-    VEC4_T inval = VEC4_T(texelFetch(image_in, fetch_pos, 0));
+    VEC4_T inval = VEC4_T(load_texel(t_in, fetch_pos));
     outval[j] = inval[in_packed_dim_lane_index];
 
     // go to next position in the input, that is mapped to the packed dim in the output
@@ -81,5 +81,5 @@ void main() {
 
   pos[packed_dim] = int(gl_GlobalInvocationID[packed_dim]);
 
-  imageStore(image_out, pos, outval);
+  imageStore(t_out, pos, outval);
 }

backends/vulkan/runtime/graph/ops/glsl/permute.yaml

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@ permute:
   parameter_names_with_default_values:
     DTYPE: float
     NDIM: 3
+    STORAGE: texture3d
   generate_variant_forall:
     DTYPE:
       - VALUE: half

backends/vulkan/runtime/graph/ops/glsl/q_8w_linear.glsl

Lines changed: 4 additions & 3 deletions
@@ -90,9 +90,10 @@ void main() {
 
 void main() {
   const u16vec2 out_pos = u16vec2(
-      gl_GlobalInvocationID.x / out_limits.y,
-      gl_GlobalInvocationID.x % out_limits.y);
-  if (out_pos.x >= out_limits.x) {
+      gl_GlobalInvocationID.x,
+      gl_GlobalInvocationID.y);
+
+  if (out_pos.x >= out_limits.x || out_pos.y >= out_limits.y) {
     return;
   }
 
backends/vulkan/runtime/graph/ops/impl/QuantizedLinear.cpp

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -114,15 +114,37 @@ void add_q_8w_linear_node(
114114
graph.sizes_ubo(mat1_W_packed)});
115115
}
116116

117-
// set global work group size to be 1 dimensional
118-
const utils::uvec3 wg_size = {
119-
static_cast<uint32_t>(graph.numel_of(out_W_packed)), 1, 1};
117+
utils::uvec3 global_wg;
118+
if (graph.is_buffer_storage(out)) {
119+
global_wg = {static_cast<uint32_t>(graph.numel_of(out_W_packed)), 1, 1};
120+
} else {
121+
global_wg = graph.logical_limits_of(out_W_packed);
122+
}
123+
124+
utils::uvec3 local_wg{8, 8, 1};
125+
int32_t out_W = graph.size_at<int32_t>(-1, out_W_packed);
126+
127+
if (graph.is_buffer_storage(out_W_packed)) {
128+
local_wg[0] = 64;
129+
local_wg[1] = 1;
130+
local_wg[2] = 1;
131+
} else {
132+
if (out_W % 8 != 0) {
133+
if (out_W % 4 == 0) {
134+
local_wg[0] = 4;
135+
local_wg[1] = 16;
136+
} else {
137+
local_wg[0] = 2;
138+
local_wg[1] = 32;
139+
}
140+
}
141+
}
120142

121143
graph.execute_nodes().emplace_back(new DispatchNode(
122144
graph,
123145
VK_KERNEL_FROM_STR(kernel_name),
124-
wg_size,
125-
graph.create_local_wg_size(wg_size),
146+
global_wg,
147+
local_wg,
126148
// Inputs and Outputs
127149
{{out_W_packed, vkapi::MemoryAccessType::WRITE},
128150
{{mat1_W_packed, q_mat2, scales}, vkapi::MemoryAccessType::READ}},
