pytorch
diff --git a/‎.ci/scripts/test_llama.sh
Lines changed: 1 addition & 2 deletions b/‎.ci/scripts/test_llama.sh
Lines changed: 1 addition & 2 deletions
diff --git a/‎.ci/scripts/unittest-buck2.sh
Lines changed: 6 additions & 2 deletions b/‎.ci/scripts/unittest-buck2.sh
Lines changed: 6 additions & 2 deletions
diff --git a/‎.github/workflows/trunk.yml
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/trunk.yml
Lines changed: 1 addition & 1 deletion
diff --git a/‎.lintrunner.toml
Lines changed: 0 additions & 4 deletions b/‎.lintrunner.toml
Lines changed: 0 additions & 4 deletions
diff --git a/‎CMakeLists.txt
Lines changed: 4 additions & 11 deletions b/‎CMakeLists.txt
Lines changed: 4 additions & 11 deletions
diff --git a/‎backends/arm/quantizer/arm_quantizer.py
Lines changed: 3 additions & 3 deletions b/‎backends/arm/quantizer/arm_quantizer.py
Lines changed: 3 additions & 3 deletions
diff --git a/‎backends/arm/quantizer/quantization_config.py
Lines changed: 26 additions & 14 deletions b/‎backends/arm/quantizer/quantization_config.py
Lines changed: 26 additions & 14 deletions
diff --git a/‎backends/arm/runtime/EthosUBackend.cpp
Lines changed: 10 additions & 42 deletions b/‎backends/arm/runtime/EthosUBackend.cpp
Lines changed: 10 additions & 42 deletions
diff --git a/‎backends/arm/test/models/test_llama.py
Lines changed: 3 additions & 1 deletion b/‎backends/arm/test/models/test_llama.py
Lines changed: 3 additions & 1 deletion
diff --git a/‎backends/cadence/fusion_g3/operators/op_clamp.cpp
Lines changed: 1 addition & 1 deletion b/‎backends/cadence/fusion_g3/operators/op_clamp.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/cadence/fusion_g3/operators/op_dequantize.cpp
Lines changed: 10 additions & 10 deletions b/‎backends/cadence/fusion_g3/operators/op_dequantize.cpp
Lines changed: 10 additions & 10 deletions
diff --git a/‎backends/cadence/fusion_g3/operators/op_div.cpp
Lines changed: 3 additions & 3 deletions b/‎backends/cadence/fusion_g3/operators/op_div.cpp
Lines changed: 3 additions & 3 deletions
diff --git a/‎backends/cadence/fusion_g3/operators/op_mean.cpp
Lines changed: 1 addition & 1 deletion b/‎backends/cadence/fusion_g3/operators/op_mean.cpp
Lines changed: 1 addition & 1 deletion
@@ -156,8 +156,7 @@ cmake_install_executorch_libraries() {
         -DCMAKE_INSTALL_PREFIX=cmake-out \
         -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
         -DEXECUTORCH_BUILD_QNN="$QNN" \
-        -DQNN_SDK_ROOT="$QNN_SDK_ROOT" \
-        -Bcmake-out .
+        -DQNN_SDK_ROOT="$QNN_SDK_ROOT"
     cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE"
 }
 
 
@@ -15,8 +15,10 @@ buck2 query "//backends/apple/... + //backends/example/... + \
 //kernels/optimized/... + //kernels/portable/... + //kernels/quantized/... + \
 //kernels/test/... + //runtime/... + //schema/... + //test/... + //util/..."
 
+# TODO: optimized ops are unbuildable because they now use ATen; put
+# them back after we can use PyTorch in OSS buck.
 UNBUILDABLE_OPTIMIZED_OPS_REGEX="_elu|gelu|fft|log_softmax"
-BUILDABLE_OPTIMIZED_OPS=$(buck2 query //kernels/optimized/cpu/... | grep -E -v $UNBUILDABLE_OPTIMIZED_OPS_REGEX)
+BUILDABLE_OPTIMIZED_OPS= #$(buck2 query //kernels/optimized/cpu/... | grep -E -v $UNBUILDABLE_OPTIMIZED_OPS_REGEX)
 
 # TODO: build prim_ops_test_cpp again once supported_features works in
 # OSS buck.
@@ -25,7 +27,9 @@ BUILDABLE_KERNELS_PRIM_OPS_TARGETS=$(buck2 query //kernels/prim_ops/... | grep -
 # //runtime/kernel/... is failing because //third-party:torchgen_files's shell script can't find python on PATH.
 # //runtime/test/... requires Python torch, which we don't have in our OSS buck setup.
 for op in "build" "test"; do
-    buck2 $op $BUILDABLE_OPTIMIZED_OPS //kernels/portable/... \
+    buck2 $op $BUILDABLE_OPTIMIZED_OPS \
+          //examples/selective_build:select_all_dtype_selective_lib_portable_lib \
+          //kernels/portable/... \
           $BUILDABLE_KERNELS_PRIM_OPS_TARGETS //runtime/backend/... //runtime/core/... \
           //runtime/executor: //runtime/kernel/... //runtime/platform/...
 done
@@ -262,7 +262,7 @@ jobs:
         output=$(ls -la ${elf})
         arr=($output)
         size=${arr[4]}
-        threshold="103068" # ~100KiB
+        threshold="103268" # ~100KiB
         echo "size: $size, threshold: $threshold"
         if [[ "$size" -le "$threshold" ]]; then
           echo "Success $size <= $threshold"
 
@@ -271,10 +271,6 @@ exclude_patterns = [
     'examples/**',
     'exir/verification/bindings.cpp',
     'extension/**',
-    # Uses properly-gated (ET_USE_PYTORCH_HEADERS) ATen include.
-    'kernels/portable/cpu/util/elementwise_util.h',
-    'kernels/portable/cpu/util/math_util.h',
-    'kernels/portable/cpu/util/vectorized_math.h',
     'kernels/optimized/**',
     'runtime/core/exec_aten/**',
     # Want to be able to keep c10 in sync with PyTorch core.
 
@@ -514,17 +514,6 @@ if(EXECUTORCH_BUILD_CORTEX_M)
 endif()
 
 if(EXECUTORCH_BUILD_DEVTOOLS)
-  if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL)
-    set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
-        ON
-        CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
-    )
-  else()
-    set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
-        OFF
-        CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
-    )
-  endif()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
 endif()
 
@@ -565,6 +554,10 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)
 endif()
 
 if(EXECUTORCH_BUILD_PYBIND)
+
+  # Add codegen tools subdirectory for selective_build pybind module
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/codegen/tools)
+
   if(NOT EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
     add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader)
   endif()
 
@@ -247,9 +247,9 @@ def set_module_name(
         quantizer.set_module_name("blocks.sub"), it will quantize all supported operator/operator
         patterns in the submodule with this module name with the given `quantization_config`
         """
-        assert (
-            quantization_config is not None
-        ), " quantization_config == None is not supported yet"
+        # Validate that quantization_config is provided
+        if quantization_config is None:
+            raise ValueError("quantization_config == None is not supported yet")
         self.module_name_config[module_name] = quantization_config
         return self
 
 
@@ -29,30 +29,40 @@ def get_input_act_qspec(self) -> QuantizationSpec | None:
         """Returns QuantizationSpec 'input_activation' after asserting that input_activation.qscheme is valid."""
         if self.input_activation is None:
             return None
-        assert self.input_activation.qscheme in [
+        # Validate that input_activation uses a supported qscheme
+        if self.input_activation.qscheme not in [
             torch.per_tensor_affine,
             torch.per_tensor_symmetric,
-        ], f"Unsupported quantization_spec {self.input_activation} for input_activation."
+        ]:
+            raise ValueError(
+                f"Unsupported quantization_spec {self.input_activation} for input_activation."
+            )
         return self.input_activation
 
     def get_output_act_qspec(self) -> QuantizationSpec | None:
         """Returns QuantizationSpec 'output_activation' after asserting that output_activation.qscheme is valid."""
         if self.output_activation is None:
             return None
-        assert self.output_activation.qscheme in [
+        # Validate that output_activation uses a supported qscheme
+        if self.output_activation.qscheme not in [
             torch.per_tensor_affine,
             torch.per_tensor_symmetric,
-        ], f"Unsupported quantization_spec {self.output_activation} for output_activation."
+        ]:
+            raise ValueError(
+                f"Unsupported quantization_spec {self.output_activation} for output_activation."
+            )
         return self.output_activation
 
     def get_weight_qspec(self) -> QuantizationSpec | None:
         """Returns QuantizationSpec 'weight' after asserting that weight.qscheme is valid."""
         if self.weight is None:
             return None
-        assert self.weight.qscheme in [
+        # Validate that weight uses a supported qscheme
+        if self.weight.qscheme not in [
             torch.per_tensor_symmetric,
             torch.per_channel_symmetric,
-        ], f"Unsupported quantization_spec {self.weight} for weight"
+        ]:
+            raise ValueError(f"Unsupported quantization_spec {self.weight} for weight")
         return self.weight
 
     def get_bias_qspec(self, node: torch.fx.Node) -> QuantizationSpec | None:
@@ -61,11 +71,11 @@ def get_bias_qspec(self, node: torch.fx.Node) -> QuantizationSpec | None:
         def _derive_qparams_fn(
             obs_or_fqs: list[ObserverOrFakeQuantize],
         ) -> tuple[torch.Tensor, torch.Tensor]:
-            assert (
-                len(obs_or_fqs) == 2
-            ), "Expecting two obs/fqs, one for activation and one for weight, got: {}".format(
-                len(obs_or_fqs)
-            )
+            # Validate expected number of observers/fake-quantizes
+            if len(obs_or_fqs) != 2:
+                raise ValueError(
+                    f"Expecting two obs/fqs, one for activation and one for weight, got: {len(obs_or_fqs)}"
+                )
             act_obs_or_fq = obs_or_fqs[0]
             weight_obs_or_fq = obs_or_fqs[1]
             act_scale, act_zp = act_obs_or_fq.calculate_qparams()
@@ -94,9 +104,11 @@ def _derive_qparams_fn(
 
         if self.bias is None:
             return None
-        assert (
-            self.bias.dtype == torch.float
-        ), "Only float dtype for bias is supported for bias right now"
+        # Validate that bias dtype is floating-point
+        if self.bias.dtype != torch.float:
+            raise ValueError(
+                "Only float dtype for bias is supported for bias right now"
+            )
         return self.bias
 
     def get_fixed_qspec(
 
@@ -261,24 +261,12 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
             event_tracer,
             "+EthosUBackend::execute()handles.input.permute_CHW_to_HWC()");
         // permuted byte copy CHW to HWC
-        int c, h, w;
-        if (tensor_in.dim() == 4) {
-          c = tensor_in.size(1);
-          h = tensor_in.size(2);
-          w = tensor_in.size(3);
-        } else if (tensor_in.dim() == 5) {
-          c = tensor_in.size(2);
-          h = tensor_in.size(3);
-          w = tensor_in.size(4);
-        } else {
-          ET_LOG(
-              Error,
-              "Unsupported input tensor dimension %d, expected 4 or 5",
-              tensor_in.dim());
-          return Error::InvalidProgram;
-        }
         permute_CHW_to_HWC(
-            tensor_in.mutable_data_ptr<char>(), scratch_addr, c, h, w);
+            tensor_in.mutable_data_ptr<char>(),
+            scratch_addr,
+            tensor_in.size(1),
+            tensor_in.size(2),
+            tensor_in.size(3));
       } else if (both_char or both_int or both_short) {
         EXECUTORCH_PROF_SCOPE(
             event_tracer, "+EthosUBackend::execute()handles.input.memcpy()");
@@ -376,24 +364,12 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
             "+EthosUBackend::execute()handles.output.permute_HWC_to_CHW()");
 
         char* output_address = (char*)output_addr;
-        int c, h, w;
-        if (tensor_out.dim() == 4) {
-          c = tensor_out.size(1);
-          h = tensor_out.size(2);
-          w = tensor_out.size(3);
-        } else if (tensor_out.dim() == 5) {
-          c = tensor_out.size(2);
-          h = tensor_out.size(3);
-          w = tensor_out.size(4);
-        } else {
-          ET_LOG(
-              Error,
-              "Unsupported output tensor dimension %d, expected 4 or 5",
-              tensor_out.dim());
-          return Error::InvalidProgram;
-        }
         permute_HWC_to_CHW(
-            output_address, tensor_out.mutable_data_ptr<char>(), c, h, w);
+            output_address,
+            tensor_out.mutable_data_ptr<char>(),
+            tensor_out.size(1),
+            tensor_out.size(2),
+            tensor_out.size(3));
       } else {
         EXECUTORCH_PROF_SCOPE(
             event_tracer, "+EthosUBackend::execute()handles.output.move()");
@@ -454,14 +430,6 @@ class EthosUBackend final : public ::executorch::runtime::BackendInterface {
       if (permuted_shape) {
         ET_LOG(Debug, "Tensor input/output %d will be permuted", index);
       }
-    } else if (tensor.dim() == 5) {
-      // Same as above, but for 5D tensors.
-      permuted_shape = tensor.size(0) == io->shape[0] &&
-          tensor.size(1) == io->shape[1] && tensor.size(2) == io->shape[4] &&
-          tensor.size(3) == io->shape[2] && tensor.size(4) == io->shape[3];
-      if (permuted_shape) {
-        ET_LOG(Debug, "Tensor input/output %d will be permuted", index);
-      }
     }
     *is_permuted = permuted_shape;
     return Error::Ok;
 
@@ -22,6 +22,7 @@
     TosaPipelineMI,
 )
 
+from executorch.examples.models.llama.config.llm_config import LlmConfig
 from executorch.examples.models.llama.export_llama_lib import (
     build_args_parser,
     get_llama_model,
@@ -89,8 +90,9 @@ def prepare_model(self):
         ]
         parser = build_args_parser()
         args = parser.parse_args(args)
+        llm_config = LlmConfig.from_args(args)
 
-        llama_model, llama_inputs, llama_meta = get_llama_model(args)
+        llama_model, llama_inputs, llama_meta = get_llama_model(llm_config)
 
         return llama_model, llama_inputs, llama_meta
 
 
@@ -21,13 +21,13 @@
 #include <executorch/kernels/portable/cpu/util/math_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
-using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::canCast;
 using ::executorch::runtime::Error;
 using ::executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 namespace cadence {
 namespace impl {
 
@@ -24,7 +24,7 @@ using ::executorch::runtime::Error;
 using ::executorch::runtime::KernelRuntimeContext;
 
 template <typename T>
-using optional = ::executorch::aten::optional<T>;
+using optional = std::optional<T>;
 /* ScalarType in Executorch do not have support for below data types.
  * So, creating a placeholder for these data types. Once, ScalarTypes is
  * updated to have support for below data types, these can be removed and
@@ -51,7 +51,7 @@ void check_dequantize_per_tensor_args(
     int64_t quant_min,
     int64_t quant_max,
     ScalarType dtype,
-    ::executorch::aten::optional<ScalarType>& out_dtype,
+    std::optional<ScalarType>& out_dtype,
     Tensor& out) {
   ET_CHECK_MSG(
       input.scalar_type() == ScalarType::Byte ||
@@ -93,7 +93,7 @@ Tensor& dequantize_impl(
     float* scale_data,
     int* zero_point_data,
     int* axis,
-    ::executorch::aten::optional<ScalarType> out_dtype) {
+    std::optional<ScalarType> out_dtype) {
   const ::executorch::aten::ArrayRef<Tensor::SizesType> input_size =
       input.sizes();
 
@@ -260,8 +260,8 @@ Tensor& dequantize_impl(
           }
         }
 
-        ::executorch::aten::optional<::executorch::aten::ArrayRef<int64_t>>
-            optional_dim_list{::executorch::aten::ArrayRef<int64_t>{
+        std::optional<::executorch::aten::ArrayRef<int64_t>> optional_dim_list{
+            ::executorch::aten::ArrayRef<int64_t>{
                 dims, size_t(input.dim() - 1)}};
 
 // Actual dequantization logic
@@ -466,8 +466,8 @@ Tensor& dequantize_impl(
           }
         }
 
-        ::executorch::aten::optional<::executorch::aten::ArrayRef<int64_t>>
-            optional_dim_list{::executorch::aten::ArrayRef<int64_t>{
+        std::optional<::executorch::aten::ArrayRef<int64_t>> optional_dim_list{
+            ::executorch::aten::ArrayRef<int64_t>{
                 dims, size_t(input.dim() - 1)}};
 
 // Actual dequantization logic
@@ -600,7 +600,7 @@ Tensor& dequantize_per_tensor_tensor_args_out(
     int64_t quant_min,
     int64_t quant_max,
     ScalarType dtype,
-    ::executorch::aten::optional<ScalarType> out_dtype,
+    std::optional<ScalarType> out_dtype,
     Tensor& out) {
 #ifdef OP_ARG_CHECK
   ET_CHECK_MSG(
@@ -639,12 +639,12 @@ Tensor& dequantize_per_channel_out(
     KernelRuntimeContext& context,
     const Tensor& input,
     const Tensor& scale,
-    const ::executorch::aten::optional<Tensor>& opt_zero_points,
+    const std::optional<Tensor>& opt_zero_points,
     int64_t axis,
     int64_t quant_min,
     int64_t quant_max,
     ScalarType dtype,
-    ::executorch::aten::optional<ScalarType> out_dtype,
+    std::optional<ScalarType> out_dtype,
     Tensor& out) {
   if (axis < 0) {
     axis += executorch::runtime::nonzero_dim(input);
 
@@ -19,14 +19,14 @@
 #include <executorch/runtime/kernel/kernel_includes.h>
 #include <executorch/runtime/platform/assert.h>
 
-using ::executorch::aten::optional;
 using ::executorch::aten::Scalar;
 using ::executorch::aten::ScalarType;
-using ::executorch::aten::string_view;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::canCast;
 using ::executorch::runtime::Error;
 using ::executorch::runtime::KernelRuntimeContext;
+using std::optional;
+using std::string_view;
 
 namespace cadence {
 namespace impl {
@@ -686,4 +686,4 @@ Tensor& div_scalar_mode_out(
 } // namespace native
 } // namespace G3
 } // namespace impl
-} // namespace cadence
+} // namespace cadence
@@ -17,11 +17,11 @@
 #include <executorch/runtime/platform/assert.h>
 
 using ::executorch::aten::ArrayRef;
-using ::executorch::aten::optional;
 using ::executorch::aten::ScalarType;
 using ::executorch::aten::Tensor;
 using ::executorch::runtime::Error;
 using ::executorch::runtime::KernelRuntimeContext;
+using std::optional;
 
 namespace cadence {
 namespace impl {
Original file line number	Diff line number	Diff line change
`@@ -156,8 +156,7 @@ cmake_install_executorch_libraries() {`
`156`	`156`	`-DCMAKE_INSTALL_PREFIX=cmake-out \`
`157`	`157`	`-DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \`
`158`	`158`	`-DEXECUTORCH_BUILD_QNN="$QNN" \`
`159`		`- -DQNN_SDK_ROOT="$QNN_SDK_ROOT" \`
`160`		`- -Bcmake-out .`
	`159`	`+ -DQNN_SDK_ROOT="$QNN_SDK_ROOT"`
`161`	`160`	`cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE"`
`162`	`161`	`}`
`163`	`162`
Original file line number	Diff line number	Diff line change
`@@ -22,6 +22,7 @@`
`22`	`22`	`TosaPipelineMI,`
`23`	`23`	`)`
`24`	`24`
	`25`	`+from executorch.examples.models.llama.config.llm_config import LlmConfig`
`25`	`26`	`from executorch.examples.models.llama.export_llama_lib import (`
`26`	`27`	`build_args_parser,`
`27`	`28`	`get_llama_model,`
`@@ -89,8 +90,9 @@ def prepare_model(self):`
`89`	`90`	`]`
`90`	`91`	`parser = build_args_parser()`
`91`	`92`	`args = parser.parse_args(args)`
	`93`	`+ llm_config = LlmConfig.from_args(args)`
`92`	`94`
`93`		`- llama_model, llama_inputs, llama_meta = get_llama_model(args)`
	`95`	`+ llama_model, llama_inputs, llama_meta = get_llama_model(llm_config)`
`94`	`96`
`95`	`97`	`return llama_model, llama_inputs, llama_meta`
`96`	`98`