
Commit cf337e7

Update

[ghstack-poisoned]

2 parents 7768a07 + 5ef38d3

85 files changed: +1348, -1166 lines


.ci/scripts/test_model.sh

Lines changed: 19 additions & 23 deletions
@@ -49,14 +49,24 @@ prepare_artifacts_upload() {
 }
 
 build_cmake_executor_runner() {
+  local backend_string_select="${1:-}"
   echo "Building executor_runner"
   rm -rf ${CMAKE_OUTPUT_DIR}
-  cmake -DCMAKE_BUILD_TYPE=Debug \
-      -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-      -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-      -B${CMAKE_OUTPUT_DIR} .
-
-  cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
+  mkdir ${CMAKE_OUTPUT_DIR}
+  if [[ "$backend_string_select" == "XNNPACK" ]]; then
+    echo "Backend $backend_string_select selected"
+    (cd ${CMAKE_OUTPUT_DIR} \
+      && cmake -DCMAKE_BUILD_TYPE=Release \
+        -DEXECUTORCH_BUILD_XNNPACK=ON \
+        -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
+    cmake --build ${CMAKE_OUTPUT_DIR} -j4
+  else
+    cmake -DCMAKE_BUILD_TYPE=Debug \
+      -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+      -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+      -B${CMAKE_OUTPUT_DIR} .
+    cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
+  fi
 }
 
 run_portable_executor_runner() {
@@ -111,19 +121,6 @@ test_model() {
   run_portable_executor_runner
 }
 
-build_cmake_xnn_executor_runner() {
-  echo "Building xnn_executor_runner"
-
-  (rm -rf ${CMAKE_OUTPUT_DIR} \
-    && mkdir ${CMAKE_OUTPUT_DIR} \
-    && cd ${CMAKE_OUTPUT_DIR} \
-    && retry cmake -DCMAKE_BUILD_TYPE=Release \
-      -DEXECUTORCH_BUILD_XNNPACK=ON \
-      -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
-
-  cmake --build ${CMAKE_OUTPUT_DIR} -j4
-}
-
 test_model_with_xnnpack() {
   WITH_QUANTIZATION=$1
   WITH_DELEGATION=$2
@@ -148,12 +145,11 @@ test_model_with_xnnpack() {
 
   # Run test model
   if [[ "${BUILD_TOOL}" == "buck2" ]]; then
+    # TODO eventually buck should also use consolidated executor runners
     buck2 run //examples/xnnpack:xnn_executor_runner -- --model_path "${OUTPUT_MODEL_PATH}"
   elif [[ "${BUILD_TOOL}" == "cmake" ]]; then
-    if [[ ! -f ${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner ]]; then
-      build_cmake_xnn_executor_runner
-    fi
-    ./${CMAKE_OUTPUT_DIR}/backends/xnnpack/xnn_executor_runner --model_path "${OUTPUT_MODEL_PATH}"
+    build_cmake_executor_runner "XNNPACK"
+    ./${CMAKE_OUTPUT_DIR}/executor_runner --model_path "${OUTPUT_MODEL_PATH}"
   else
     echo "Invalid build tool ${BUILD_TOOL}. Only buck2 and cmake are supported atm"
     exit 1

.github/scripts/label_utils.py

Lines changed: 1 addition & 3 deletions
@@ -22,9 +22,7 @@
 
 LABEL_ERR_MSG_TITLE = "This PR needs a `release notes:` label"
 LABEL_ERR_MSG = f"""# {LABEL_ERR_MSG_TITLE}
-If your change should be included in the release notes (i.e. would users of this library care about this change?), please use a label starting with `release notes:`.
-
-If not, please add the `release notes: none` label.
+If your change should be included in the release notes (i.e. would users of this library care about this change?), please use a label starting with `release notes:`. This helps us keep track and include your important work in the next release notes.
 
 To add a label, you can comment to pytorchbot, for example
 `@pytorchbot label "release notes: none"`

.github/scripts/trymerge.py

Lines changed: 1 addition & 9 deletions
@@ -59,12 +59,7 @@
     patterns_to_regex,
     retries_decorator,
 )
-from label_utils import (
-    gh_add_labels,
-    gh_remove_label,
-    has_required_labels,
-    LABEL_ERR_MSG,
-)
+from label_utils import gh_add_labels, gh_remove_label
 from trymerge_explainer import get_revert_message, TryMergeExplainer
 
 # labels
@@ -2116,9 +2111,6 @@ def merge(
     # Check for approvals
     find_matching_merge_rule(pr, repo, skip_mandatory_checks=True)
 
-    if not has_required_labels(pr):
-        raise RuntimeError(LABEL_ERR_MSG.lstrip(" #"))
-
     if ignore_current:
         checks = pr.get_checkrun_conclusions()
         _, failing, _ = categorize_checks(

.github/workflows/check-labels.yml

Lines changed: 1 addition & 1 deletion
@@ -51,4 +51,4 @@ jobs:
       PR_NUM: ${{ github.event.number || github.event.inputs.pr_number }}
     run: |
       set -ex
-      python3 .github/scripts/check_labels.py --exit-non-zero "${PR_NUM}"
+      python3 .github/scripts/check_labels.py "${PR_NUM}"

backends/cadence/aot/replace_ops.py

Lines changed: 7 additions & 5 deletions
@@ -2065,11 +2065,10 @@ def call_operator(
         return super().call_operator(op, args, kwargs, meta)
 
 
-@register_cadence_pass(CadencePassAttribute(opt_level=2))
-class ReplaceGeluWithApproximateGeluPass(ExportPass):
+@register_cadence_pass(CadencePassAttribute(opt_level=0))
+class ReplaceAtenApproxGeluWithApproxGeluPass(ExportPass):
     """
-    Replace the gelu op with an approximate gelu op. The approximate gelu op
-    is more efficient on DSP backends.
+    Replace the aten gelu op with an approximate arg with an approximate gelu op.
     """
 
     def call_operator(
@@ -2079,6 +2078,9 @@ def call_operator(
        kwargs: Dict[str, Argument],
        meta: NodeMetadata,
    ) -> ProxyValue:
+        if "approximate" not in kwargs:
+            return super().call_operator(op, args, kwargs, meta)
+
        if op not in {
            exir_ops.edge.aten.gelu.default,
        }:
@@ -2414,7 +2416,7 @@ class CadenceReplaceOpsInGraph:
         ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass,
         ReplaceAtenAvgPoolWithJarvisAvgPoolPass,
         ReplaceWhereWithFullArgsWithWhereScalar,
-        ReplaceGeluWithApproximateGeluPass,
+        ReplaceAtenApproxGeluWithApproxGeluPass,
         ReplaceSplitWithSlicePass,
         ReplacePowWithMulPass,
     ]

backends/cadence/aot/tests/test_replace_ops_passes.py

Lines changed: 32 additions & 8 deletions
@@ -26,13 +26,13 @@
     ForceChannelLastForConvPass,
     MakeSliceAndCatDimOutermostPass,
     ReplaceAddMMWithLinearPass,
+    ReplaceAtenApproxGeluWithApproxGeluPass,
     ReplaceAtenConvolutionWithJarvisConvolutionPass,
     ReplaceConstantPadNdWithSlicePass,
     ReplaceConvolutionOptionalArgsWithConcreteArgsPass,
     ReplaceConvWithIm2RowAndLinear,
     ReplaceEmptyTensorsWithFullPass,
     ReplaceFunctionallyEquivalentOpTargets,
-    ReplaceGeluWithApproximateGeluPass,
     ReplaceIm2RowWithViewPass,
     ReplaceLinearWithFullyConnectedOpPass,
     ReplaceMatmulWithTransposedMatmulPass,
@@ -1287,17 +1287,41 @@ def forward(self, cond: torch.Tensor):
             1,
         )
 
-    def test_replace_aten_gelu_with_approximate_gelu(self):
-        class Gelu(torch.nn.Module):
-            def forward(self, input):
-                return torch.nn.functional.gelu(input)
+    def test_no_replace_aten_gelu_with_approximate_gelu(self):
+        inputs = torch.randn(2, 1, 64)
+
+        gm = single_op_builder(
+            placeholders=(inputs,),
+            op=exir_ops.edge.aten.gelu.default,
+            args=(inputs,),
+        )
+        gm = ExportPass().call(gm).graph_module
+
+        p = ReplaceAtenApproxGeluWithApproxGeluPass()
+        graph_after_passes = p.call(gm).graph_module
 
+        # Assert that aten.gelu op was not decomposed, since it didn't have an approximate argument
+        self.assertEqual(
+            count_node(
+                graph_after_passes,
+                exir_ops.edge.aten.gelu.default,
+            ),
+            1,
+        )
+
+    def test_replace_aten_approximate_gelu_with_approximate_gelu(self):
         inputs = torch.randn(2, 1, 64)
 
-        graph_module = export_to_edge(Gelu(), (inputs,)).exported_program().graph_module
+        gm = single_op_builder(
+            placeholders=(inputs,),
+            op=exir_ops.edge.aten.gelu.default,
+            args=(inputs,),
+            kwargs={"approximate": "tanh"},
+        )
+        gm = ExportPass().call(gm).graph_module
 
-        p = ReplaceGeluWithApproximateGeluPass()
-        graph_after_passes = cast(PassResult, p(graph_module)).graph_module
+        p = ReplaceAtenApproxGeluWithApproxGeluPass()
+        graph_after_passes = p.call(gm).graph_module
 
         # Assert that aten.gelu op was decomposed
         self.assertEqual(

backends/vulkan/runtime/VulkanBackend.cpp

Lines changed: 7 additions & 0 deletions
@@ -499,6 +499,8 @@ class VulkanBackend final : public ::executorch::runtime::BackendInterface {
     compute_graph->encode_prepack();
     compute_graph->prepack();
 
+    // TODO(ssjia): remove this once we can batch compile compute pipelines
+    // during prepare().
     compute_graph->encode_execute();
 
     return Error::Ok;
@@ -567,9 +569,14 @@
       }
     }
 
+    // propagate_resize() will re-encode the command buffer so that push
+    // constants are updated and DynamicDispatchNode can update the compute
+    // shader, global workgroup size, and local workgroup size to perform the
+    // model inference.
    if (should_propagate_resize) {
      compute_graph->propagate_resize();
    }
+
    compute_graph->execute();
 
    for (size_t i = 0; i < compute_graph->outputs().size(); i++) {
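
The comments added above describe the dynamic-shape path: when an input is resized, propagate_resize() now re-encodes the command buffer before the next execute() submission. A minimal caller-side sketch of that flow, assuming the ComputeGraph API as modified in this commit; the function name run_with_new_shape, the include path, and the use of input index 0 are illustrative, not part of the change:

// Sketch only, not from this commit: drives one inference after an
// input shape change, using the ComputeGraph methods touched here.
#include <cstdint>
#include <vector>
#include <executorch/backends/vulkan/runtime/graph/ComputeGraph.h> // include path assumed

void run_with_new_shape(
    vkcompute::ComputeGraph* graph,
    const std::vector<int64_t>& new_sizes) {
  // Update the sizes of input 0 in place (virtual resize, no reallocation).
  graph->resize_input(0, new_sizes);
  // Run each node's resize logic; as of this commit this also re-encodes
  // the command buffer via encode_execute().
  graph->propagate_resize();
  // Submit the re-encoded command buffer (also bumps execute_count_).
  graph->execute();
}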

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 20 additions & 2 deletions
@@ -492,14 +492,24 @@ vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer(
     const ValueRef idx) {
   if (values_.at(idx).isInt()) {
     const int32_t val = extract_scalar<int32_t>(idx);
-    create_params_buffer(val);
+    return create_params_buffer(val);
   } else if (values_.at(idx).isSymInt()) {
     SymIntPtr symint = get_symint(idx);
     return vkapi::BufferBindInfo(symint->gpu_buffer.buffer());
   }
   VK_THROW("Cannot create a int param buffer for the given value");
 }
 
+vkapi::BufferBindInfo ComputeGraph::get_or_create_int_param_buffer(
+    const ValueRef idx,
+    const int32_t default_val) {
+  if (values_.at(idx).isNone()) {
+    return create_params_buffer(default_val);
+  } else {
+    return get_or_create_int_param_buffer(idx);
+  }
+}
+
 void ComputeGraph::set_symint(const ValueRef idx, const int32_t val) {
   get_symint(idx)->set(val);
 }
@@ -678,11 +688,12 @@ void ComputeGraph::encode_execute() {
   }
 }
 
-void ComputeGraph::execute() const {
+void ComputeGraph::execute() {
   vkapi::VulkanFence fence = context_->fences().get_fence();
   context_->submit_cmd_to_gpu(fence.get_submit_handle());
   fence.wait();
   context_->fences().return_fence(fence);
+  execute_count_++;
 }
 
 void ComputeGraph::resize_input(
@@ -692,10 +703,17 @@
   get_tensor(io_val.value)->virtual_resize(new_sizes);
 }
 
+void ComputeGraph::virtual_resize(
+    const ValueRef idx,
+    const std::vector<int64_t>& new_sizes) {
+  get_tensor(idx)->virtual_resize(new_sizes);
+}
+
 void ComputeGraph::propagate_resize() {
   for (std::unique_ptr<ExecuteNode>& node : execute_nodes_) {
     node->trigger_resize(this);
   }
+  encode_execute();
 }
 
 } // namespace vkcompute
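
The new get_or_create_int_param_buffer overload gives op implementations a single call for optional integer arguments: a None value materializes a params buffer holding the supplied default, while an Int or SymInt falls through to the original overload. A hedged usage sketch; the helper name bind_optional_int and the default of 1 are illustrative, not from this commit:

// Sketch only, not from this commit: bind an optional int argument,
// assuming the overload added above.
vkapi::BufferBindInfo bind_optional_int(
    vkcompute::ComputeGraph& graph,
    const vkcompute::ValueRef maybe_int) {
  // None -> params buffer containing 1; Int/SymInt -> the existing value.
  return graph.get_or_create_int_param_buffer(maybe_int, /*default_val=*/1);
}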

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 26 additions & 1 deletion
@@ -187,6 +187,7 @@ class ComputeGraph final {
 
  protected:
   size_t values_in_use_ = 0;
+  size_t execute_count_ = 0;
 
  public:
   //
@@ -397,6 +398,19 @@
   std::optional<T> extract_optional_scalar(const ValueRef idx) {
     if (val_is_none(idx)) {
       return ::std::nullopt;
+    } else if (val_is_symint(idx)) {
+      return utils::safe_downcast<T>(read_symint(idx));
+    } else {
+      return extract_scalar<T>(idx);
+    }
+  }
+
+  template <typename T>
+  T extract_optional_scalar(const ValueRef idx, const T default_val) {
+    if (val_is_none(idx)) {
+      return default_val;
+    } else if (val_is_symint(idx)) {
+      return utils::safe_downcast<T>(read_symint(idx));
     } else {
       return extract_scalar<T>(idx);
     }
@@ -608,6 +622,10 @@
   */
  vkapi::BufferBindInfo get_or_create_int_param_buffer(const ValueRef idx);
 
+  vkapi::BufferBindInfo get_or_create_int_param_buffer(
+      const ValueRef idx,
+      const int32_t default_value);
+
  void set_symint(const ValueRef idx, const int32_t val);
 
  int32_t read_symint(const ValueRef idx);
@@ -745,13 +763,16 @@
   //
 
   void encode_execute();
-  void execute() const;
+  void execute();
 
   //
   // Dynamic Shape support
   //
 
   void resize_input(const int64_t idx, const std::vector<int64_t>& new_sizes);
+  void virtual_resize(
+      const ValueRef idx,
+      const std::vector<int64_t>& new_sizes);
   void propagate_resize();
 
   //
@@ -762,6 +783,10 @@
     return context_->adapter_ptr()->supports_int16_shader_types();
   }
 
+  inline size_t execute_count() const {
+    return execute_count_;
+  }
+
   /*
    * Check whether the GPU supports 8 bit buffers.
    */
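
The defaulted extract_optional_scalar overload mirrors the buffer helper in ComputeGraph.cpp (None yields the caller-supplied default, a SymInt is read and safely downcast, anything else goes through extract_scalar), and the new execute_count() accessor exposes how many times the graph has been submitted. A short sketch under the same assumptions; the names read_dim_or_default and dim_ref are illustrative:

// Sketch only, not from this commit: read an optional scalar with a
// fallback, assuming the templated overload declared above.
int64_t read_dim_or_default(
    vkcompute::ComputeGraph& graph,
    const vkcompute::ValueRef dim_ref) {
  // None -> -1; SymInt -> safe_downcast of its current value;
  // anything else -> extract_scalar<int64_t>.
  return graph.extract_optional_scalar<int64_t>(dim_ref, int64_t{-1});
}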
