
Commit 7626b7b
Update on "[ET-VK] Enable additional specialization constants in compute shaders"
## Context

Building on top of the previous changeset in the stack, this changeset modifies shader dispatch APIs to accept additional specialization constants for a shader.

Differential Revision: [D56225042](https://our.internmc.facebook.com/intern/diff/D56225042/)

[ghstack-poisoned]
2 parents: bdc7896 + 44f704e

File tree: 29 files changed (+435, −64 lines)

.ci/scripts/test_llama.sh

Lines changed: 17 additions & 6 deletions
@@ -12,7 +12,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 MODEL_NAME=$1 # stories110M.pt
 BUILD_TOOL=$2 # buck2 or cmake
 DTYPE=$3 # fp16 or fp32
-MODE=${4:-"xnnpack"} # portable or xnnpack
+MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe
 if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
   echo "Expecting atleast 4 positional arguments"
   echo "Usage: [...]"
@@ -37,7 +37,7 @@ if [[ -z "${MODE:-}" ]]; then
   exit 1
 fi

-if [[ "${MODE}" =~ xnnpack.* ]]; then
+if [[ "${MODE}" =~ .*xnnpack.* ]]; then
   XNNPACK=ON
 else
   XNNPACK=OFF
@@ -49,6 +49,12 @@ else
   CUSTOM=OFF
 fi

+if [[ "${MODE}" =~ .*qe.* ]]; then
+  QE=ON
+else
+  QE=OFF
+fi
+
 if [[ -z "${BUCK:-}" ]]; then
   BUCK=buck2
 fi
@@ -84,7 +90,6 @@ cmake_build_llama_runner() {
     -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
     -DEXECUTORCH_BUILD_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-    -DEXECUTORCH_BUILD_OPTIMIZED=ON \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out/${dir} \
     ${dir}
@@ -126,9 +131,15 @@ fi
 # Export model.
 EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
 echo "Exporting ${EXPORTED_MODEL_NAME}"
-EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME}"
-if [[ "${MODE}" == "xnnpack+kv+custom" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -kv --use_sdpa_with_kv_cache -X -qmode 8da4w -G 128"
+EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv"
+if [[ "${XNNPACK}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} -X -qmode 8da4w -G 128"
+fi
+if [[ "${CUSTOM}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache"
+fi
+if [[ "${QE}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,1024"
 fi
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}

.ci/scripts/test_quantized_aot_lib.sh

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ build_cmake_quantized_aot_lib() {
     && retry cmake -DBUCK2=buck2 \
       -DCMAKE_BUILD_TYPE=Release \
       -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
-      -DEXECUTORCH_BUILD_QUANTIZED=ON \
+      -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON \
       -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)

  cmake --build ${CMAKE_OUTPUT_DIR} -j4

.github/workflows/android.yml

Lines changed: 3 additions & 2 deletions
@@ -10,7 +10,8 @@ on:
       - .ci/docker/**
       - .github/workflows/android.yml
       - install_requirements.sh
-      - examples/demo-apps/**
+      - examples/demo-apps/android/**
+      - extension/android/**
       - extension/module/**
   workflow_dispatch:
@@ -101,7 +102,7 @@ jobs:
       android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/app-debug.apk
       android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/app-debug-androidTest.apk
       # The test spec can be downloaded from https://ossci-assets.s3.amazonaws.com/android-llama2-device-farm-test-spec.yml
-      test-spec: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/414cb54d-4d83-4576-8317-93244e4dc50e
+      test-spec: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/abd86868-fa63-467e-a5c7-218194665a77
       # The exported llama2 model and its tokenizer, can be downloaded from https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b.zip.
       # Among the input, this is the biggest file and uploading it to AWS beforehand makes the test run much faster
       extra-data: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/bd15825b-ddab-4e47-9fef-a9c8935778dd

.github/workflows/doc-build.yml

Lines changed: 7 additions & 0 deletions
@@ -68,6 +68,13 @@ jobs:
           make html
           cd ..

+          # If it's main branch, add noindex tag to all .html files to exclude from Google Search indexing.
+          GITHUB_REF=${{ github.ref }}
+          echo "GitHub Ref: ${GITHUB_REF}"
+          if [[ "${{ github.ref }}" == 'refs/heads/main' ]]; then
+            find docs/_build/html/ -name "*.html" -print0 | xargs -0 sed -i '/<head>/a \ \ <meta name="robots" content="noindex">';
+          fi
+
           cp -rf docs/_build/html/* "${RUNNER_DOCS_DIR}"

           mv docs/_build/html "${RUNNER_ARTIFACT_DIR}"

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ jobs:
       matrix:
         dtype: [fp32]
         build-tool: [buck2, cmake]
-        mode: [portable, xnnpack+kv+custom]
+        mode: [portable, xnnpack+custom, xnnpack+custom+qe]
       fail-fast: false
     with:
       runner: linux.2xlarge

CMakeLists.txt

Lines changed: 2 additions & 11 deletions
@@ -164,8 +164,6 @@ option(EXECUTORCH_BUILD_QNN "Build the Qualcomm backend" OFF)

 option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF)

-option(EXECUTORCH_BUILD_QUANTIZED "Build the quantized kernels" OFF)
-
 option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch SDK")

 option(EXECUTORCH_BUILD_SIZE_TEST "Build the size test" OFF)
@@ -413,9 +411,7 @@ if(EXECUTORCH_BUILD_OPTIMIZED)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()

-if(EXECUTORCH_BUILD_QUANTIZED)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized)
-endif()
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized)

 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations)

@@ -445,19 +441,14 @@ cmake_dependent_option(
   EXECUTORCH_BUILD_HOST_TARGETS OFF)
 if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
   # Baseline libraries that executor_runner will link against.
-  set(_executor_runner_libs executorch gflags)
+  set(_executor_runner_libs executorch gflags quantized_ops_lib)

   if(EXECUTORCH_BUILD_OPTIMIZED)
     list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
   else()
     list(APPEND _executor_runner_libs portable_ops_lib)
   endif()

-  # Generate lib to register quantized ops
-  if(EXECUTORCH_BUILD_QUANTIZED)
-    list(APPEND _executor_runner_libs quantized_ops_lib)
-  endif()
-
   add_executable(executor_runner ${_executor_runner__srcs})
   if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT APPLE)
     target_link_options(executor_runner PRIVATE "LINKER:--gc-sections")

backends/vulkan/runtime/api/Context.h

Lines changed: 3 additions & 3 deletions
@@ -495,7 +495,7 @@ inline bool Context::submit_compute_job(
     PipelineBarrier& pipeline_barrier,
     const utils::uvec3& global_work_group,
     const utils::uvec3& local_work_group_size,
-    const SpecVarList& specialization,
+    const SpecVarList& specialization_constants,
     VkFence fence_handle,
     Arguments&&... arguments) {
   // If any of the provided arguments does not have memory associated with it,
@@ -538,8 +538,8 @@
 #endif /* USE_VULKAN_GPU_DIAGNOSTICS */

   // Factor out template parameter independent code to minimize code bloat.
-  DescriptorSet descriptor_set =
-      get_descriptor_set(shader, local_work_group_size, specialization);
+  DescriptorSet descriptor_set = get_descriptor_set(
+      shader, local_work_group_size, specialization_constants);

   detail::bind(
       descriptor_set,
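For orientation, a minimal sketch of a call site under the renamed parameter, modeled on the test code later in this commit; the kernel name and workgroup sizes are placeholders, not code from the diff:

api::PipelineBarrier pipeline_barrier{};
api::SpecVarList specialization_constants = {}; // empty list; real constants would be listed here

api::context()->submit_compute_job(
    VK_KERNEL_FROM_STR("my_kernel"), // hypothetical shader name
    pipeline_barrier,
    {64, 64, 1},              // global work group
    {8, 8, 1},                // local work group size
    specialization_constants, // forwarded to get_descriptor_set()
    VK_NULL_HANDLE);          // no fence; buffer/image arguments would follow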

backends/vulkan/runtime/api/Pipeline.cpp

Lines changed: 2 additions & 4 deletions
@@ -157,9 +157,7 @@ bool operator==(const SpecVar& lhs, const SpecVar& rhs) {
   return false;
 }

-SpecVarList::SpecVarList() {
-  vars.reserve(8);
-}
+SpecVarList::SpecVarList() {}

 SpecVarList::SpecVarList(std::initializer_list<SpecVar> init_list) {
   vars.resize(init_list.size());
@@ -176,7 +174,7 @@ std::vector<VkSpecializationMapEntry> SpecVarList::generate_map_entries()
   map_entries.resize(vars.size());
   uint32_t cur_offset = 0u;
   for (uint32_t i = 0; i < vars.size(); ++i) {
-    map_entries[i] = {
+    map_entries.at(i) = {
         i, cur_offset + vars.at(i).val_offset(), vars.at(i).val_size()};
     cur_offset += sizeof(SpecVar);
   }
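Each map entry points back into the SpecVar array itself: entry i covers the value bytes of vars[i], and cur_offset advances by sizeof(SpecVar) per element, so the array doubles as the specialization data blob. As a hedged sketch (not code from this diff), this is how such entries are typically consumed when a Vulkan compute pipeline is created; the dataSize expression assumes the blob is the contiguous SpecVar array:

std::vector<VkSpecializationMapEntry> entries = spec_vars.generate_map_entries();

VkSpecializationInfo spec_info{};
spec_info.mapEntryCount = static_cast<uint32_t>(entries.size());
spec_info.pMapEntries = entries.data();
spec_info.dataSize = spec_vars.size() * sizeof(api::SpecVar); // assumed blob size
spec_info.pData = spec_vars.data(); // the SpecVar array backs the constant data

// &spec_info is then supplied via VkPipelineShaderStageCreateInfo::pSpecializationInfo
// when the compute pipeline is built.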

backends/vulkan/runtime/api/Pipeline.h

Lines changed: 7 additions & 2 deletions
@@ -53,12 +53,17 @@ struct SpecVar final {

 bool operator==(const SpecVar& lhs, const SpecVar& rhs);

-struct SpecVarList final {
+class SpecVarList final {
   std::vector<SpecVar> vars;

+ public:
   SpecVarList();
   SpecVarList(std::initializer_list<SpecVar> init_list);

+  inline const SpecVar& at(const size_t index) const {
+    return vars.at(index);
+  }
+
   inline const SpecVar* data() const {
     return vars.data();
   }
@@ -235,7 +240,7 @@ class ComputePipelineCache final {
     seed = utils::hash_combine(seed, std::hash<uint32_t>()(spec_vars.size()));

     for (int i = 0; i < spec_vars.size(); ++i) {
-      const SpecVar& spec_var = spec_vars.vars.at(i);
+      const SpecVar& spec_var = spec_vars.at(i);
       size_t new_seed = 0;
       switch (spec_var.type) {
         case SpecVar::Type::FLOAT:
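The new at() accessor keeps the pipeline-cache key hashing working now that vars is private. utils::hash_combine itself does not appear in this diff; for illustration only, a common boost-style formulation folds each value into the running seed like this:

inline size_t hash_combine(size_t seed, size_t value) {
  // 0x9e3779b9 is the 32-bit golden-ratio constant, used to spread the bits.
  return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2));
}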

backends/vulkan/test/utils/test_utils.cpp

Lines changed: 9 additions & 4 deletions
@@ -21,13 +21,14 @@ void record_nchw_to_image_op(
     api::VulkanBuffer& src_buffer,
     vTensor& v_dst) {
   api::PipelineBarrier pipeline_barrier{};
+  api::SpecVarList specialization_constants = {};

   context->submit_compute_job(
       get_nchw_to_image_shader(v_dst),
       pipeline_barrier,
       v_dst.virtual_extents(),
       adaptive_work_group_size(v_dst.virtual_extents()),
-      {},
+      specialization_constants,
       VK_NULL_HANDLE,
       v_dst.image(
           pipeline_barrier,
@@ -43,12 +44,14 @@ void record_image_to_nchw_op(
     vTensor& v_src,
     api::VulkanBuffer& dst_buffer) {
   api::PipelineBarrier pipeline_barrier{};
+  api::SpecVarList specialization_constants = {};
+
   context->submit_compute_job(
       get_image_to_nchw_shader(v_src),
       pipeline_barrier,
       v_src.virtual_extents(),
       adaptive_work_group_size(v_src.virtual_extents()),
-      {},
+      specialization_constants,
       VK_NULL_HANDLE,
       v_src.image(pipeline_barrier, api::PipelineStage::COMPUTE),
       dst_buffer,
@@ -80,12 +83,13 @@ void record_conv2d_prepack_weights_op(
   api::UniformParamsBuffer padded_sizes_ubo(
       context, api::utils::make_ivec2(padded_sizes, /*reverse = */ true));

+  api::SpecVarList specialization_constants = {};
   context->submit_compute_job(
       shader,
       pipeline_barrier,
       v_dst.virtual_extents(),
       adaptive_work_group_size(v_dst.virtual_extents()),
-      {},
+      specialization_constants,
       VK_NULL_HANDLE,
       v_dst.image(
           pipeline_barrier,
@@ -107,12 +111,13 @@ void record_binary_op(
   add_dtype_suffix(kernel_name, v_dst);

   api::PipelineBarrier pipeline_barrier{};
+  api::SpecVarList specialization_constants = {};
   context->submit_compute_job(
       VK_KERNEL_FROM_STR(kernel_name),
       pipeline_barrier,
       v_dst.virtual_extents(),
       adaptive_work_group_size(v_dst.virtual_extents()),
-      {},
+      specialization_constants,
       VK_NULL_HANDLE,
       v_dst.image(
           pipeline_barrier,

backends/vulkan/test/vulkan_compute_api_test.cpp

Lines changed: 6 additions & 3 deletions
@@ -153,12 +153,13 @@ TEST_F(VulkanComputeAPITest, update_params_between_submit) {

   {
     api::PipelineBarrier pipeline_barrier{};
+    api::SpecVarList specialization_constants = {};
     api::context()->submit_compute_job(
         VK_KERNEL_FROM_STR(kernel_name),
         pipeline_barrier,
         {4, 4, 4},
         {4, 4, 4},
-        {},
+        specialization_constants,
         VK_NULL_HANDLE,
         a.image(
             pipeline_barrier,
@@ -213,12 +214,13 @@ void test_storage_buffer_type(const size_t len) {
   {
     uint32_t len_div4 = api::utils::div_up(uint32_t(len), uint32_t(4));
     api::PipelineBarrier pipeline_barrier{};
+    api::SpecVarList specialization_constants = {};
     api::context()->submit_compute_job(
         VK_KERNEL_FROM_STR(kernel_name),
         pipeline_barrier,
         {64, 1, 1},
         {len_div4, 1, 1},
-        {},
+        specialization_constants,
         VK_NULL_HANDLE,
         buffer.buffer(),
         params.buffer());
@@ -909,12 +911,13 @@ void run_from_gpu_test(

   {
     api::PipelineBarrier pipeline_barrier{};
+    api::SpecVarList specialization_constants = {};
     api::context()->submit_compute_job(
         VK_KERNEL_FROM_STR(kernel_name),
         pipeline_barrier,
         vten.virtual_extents(),
         {4, 4, 4},
-        {},
+        specialization_constants,
         VK_NULL_HANDLE,
         vten.image(
             pipeline_barrier,
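The same mechanical change repeats across the tests: the inline {} argument becomes a named, default-constructed SpecVarList. One plausible reading (an assumption, not stated in the commit message) is that the named variable makes the empty specialization-constant list explicit and gives each test a single place to populate constants later:

api::SpecVarList specialization_constants = {};
// specialization_constants = { /* SpecVar values */ }; // hypothetical: set when a test needs real constants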

build/Utils.cmake

Lines changed: 0 additions & 2 deletions
@@ -74,8 +74,6 @@ function(executorch_print_configuration_summary)
       STATUS " EXECUTORCH_BUILD_QNN : ${EXECUTORCH_BUILD_QNN}")
   message(STATUS " EXECUTORCH_BUILD_OPTIMIZED : "
       "${EXECUTORCH_BUILD_OPTIMIZED}")
-  message(STATUS " EXECUTORCH_BUILD_QUANTIZED : "
-      "${EXECUTORCH_BUILD_QUANTIZED}")
   message(
       STATUS " EXECUTORCH_BUILD_SDK : ${EXECUTORCH_BUILD_SDK}")
   message(

build/build_apple_frameworks.sh

Lines changed: 2 additions & 5 deletions
@@ -22,7 +22,7 @@ CUSTOM=OFF
 MPS=OFF
 OPTIMIZED=OFF
 PORTABLE=OFF
-QUANTIZED=OFF
+QUANTIZED=ON
 XNNPACK=OFF
 HEADERS_PATH="include"
 EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libexecutorch_no_prim_ops.a,libextension_apple.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH"
@@ -51,7 +51,6 @@ usage() {
   echo " --mps Include this flag to build the Metal Performance Shaders backend."
   echo " --optimized Include this flag to build the Optimized backend."
   echo " --portable Include this flag to build the Portable backend."
-  echo " --quantized Include this flag to build the Quantized backend."
   echo " --xnnpack Include this flag to build the XNNPACK backend."
   echo
   echo "Example:"
@@ -74,7 +73,6 @@ for arg in "$@"; do
     --mps) MPS=ON ;;
     --optimized) OPTIMIZED=ON ;;
     --portable) PORTABLE=ON ;;
-    --quantized) QUANTIZED=ON ;;
     --xnnpack) XNNPACK=ON ;;
     *)
       if [[ -z "$SOURCE_ROOT_DIR" ]]; then
@@ -137,7 +135,6 @@ cmake_build() {
     -DEXECUTORCH_BUILD_CUSTOM=$CUSTOM \
     -DEXECUTORCH_BUILD_MPS=$MPS \
     -DEXECUTORCH_BUILD_OPTIMIZED=$OPTIMIZED \
-    -DEXECUTORCH_BUILD_QUANTIZED=$QUANTIZED \
     -DEXECUTORCH_BUILD_XNNPACK=$XNNPACK \
     ${platform_flag:+-DIOS_PLATFORM=$platform_flag}
   cmake --build . --config $MODE
@@ -181,7 +178,7 @@ append_framework_flag "$CUSTOM" "$CUSTOM_FRAMEWORK"
 append_framework_flag "$MPS" "$MPS_FRAMEWORK"
 append_framework_flag "$OPTIMIZED" "$OPTIMIZED_FRAMEWORK"
 append_framework_flag "$PORTABLE" "$PORTABLE_FRAMEWORK"
-append_framework_flag "$QUANTIZED" "$QUANTIZED_FRAMEWORK"
+append_framework_flag "ON" "$QUANTIZED_FRAMEWORK"
 append_framework_flag "$XNNPACK" "$XNNPACK_FRAMEWORK"

 "$SOURCE_ROOT_DIR"/build/create_frameworks.sh "${FRAMEWORK_FLAGS[@]}"

build/executorch-config.cmake

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ set(lib_list
     etdump bundled_program extension_data_loader ${FLATCCRT_LIB} mpsdelegate
     qnn_executorch_backend portable_ops_lib extension_module xnnpack_backend
     XNNPACK cpuinfo pthreadpool vulkan_backend optimized_kernels cpublas eigen_blas
-    optimized_ops_lib optimized_native_cpu_ops_lib
+    optimized_ops_lib optimized_native_cpu_ops_lib quantized_kernels quantized_ops_lib
 )
 foreach(lib ${lib_list})
   # Name of the variable which stores result of the find_library search
