Skip to content

Commit 37e529b

Browse files
committed
Update on "Skip annotate boolean input"
Differential Revision: [D55946526](https://our.internmc.facebook.com/intern/diff/D55946526/) [ghstack-poisoned]
2 parents 88e6051 + ee74466 commit 37e529b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+1818
-401
lines changed

.ci/scripts/test_llama.sh

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,18 @@ if [[ -z "${MODE:-}" ]]; then
3737
exit 1
3838
fi
3939

40+
if [[ "${MODE}" =~ xnnpack.* ]]; then
41+
XNNPACK=ON
42+
else
43+
XNNPACK=OFF
44+
fi
45+
46+
if [[ "${MODE}" =~ .*custom.* ]]; then
47+
CUSTOM=ON
48+
else
49+
CUSTOM=OFF
50+
fi
51+
4052
if [[ -z "${BUCK:-}" ]]; then
4153
BUCK=buck2
4254
fi
@@ -47,38 +59,36 @@ fi
4759

4860
which "${PYTHON_EXECUTABLE}"
4961

50-
5162
cmake_install_executorch_libraries() {
5263
echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
5364
rm -rf cmake-out
54-
if [[ "${MODE}" == "xnnpack" ]]; then
55-
XNNPACK=ON
56-
else
57-
XNNPACK=OFF
58-
fi
5965
retry cmake -DBUCK2="$BUCK" \
6066
-DCMAKE_INSTALL_PREFIX=cmake-out \
61-
-DCMAKE_BUILD_TYPE=Release \
67+
-DCMAKE_BUILD_TYPE=Debug \
6268
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
6369
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
70+
-DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
6471
-DEXECUTORCH_BUILD_OPTIMIZED=ON \
6572
-DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
6673
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
6774
-Bcmake-out .
68-
cmake --build cmake-out -j9 --target install --config Release
75+
cmake --build cmake-out -j9 --target install --config Debug
6976
}
7077

7178
cmake_build_llama_runner() {
7279
echo "Building llama runner"
7380
dir="examples/models/llama2"
7481
retry cmake -DBUCK2="$BUCK" \
7582
-DCMAKE_INSTALL_PREFIX=cmake-out \
76-
-DCMAKE_BUILD_TYPE=Release \
83+
-DCMAKE_BUILD_TYPE=Debug \
84+
-DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
85+
-DEXECUTORCH_BUILD_OPTIMIZED=ON \
86+
-DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
7787
-DEXECUTORCH_BUILD_OPTIMIZED=ON \
7888
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
7989
-Bcmake-out/${dir} \
8090
${dir}
81-
cmake --build cmake-out/${dir} -j9 --config Release
91+
cmake --build cmake-out/${dir} -j9 --config Debug
8292

8393
}
8494

@@ -117,9 +127,10 @@ fi
117127
EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
118128
echo "Exporting ${EXPORTED_MODEL_NAME}"
119129
EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME}"
120-
if [[ "${MODE}" == "xnnpack" ]]; then
130+
if [[ "${MODE}" == "xnnpack+kv+custom" ]]; then
121131
EXPORT_ARGS="${EXPORT_ARGS} -kv --use_sdpa_with_kv_cache -X -qmode 8da4w -G 128"
122132
fi
133+
# Add dynamically linked library location
123134
$PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}
124135

125136
# Create tokenizer.bin.

.github/workflows/_unittest.yml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,6 @@ jobs:
5757
script: |
5858
set -eux
5959
60-
WORKSPACE=$(pwd)
61-
pushd "${WORKSPACE}/pytorch/executorch"
62-
6360
BUILD_TOOL=${{ matrix.build-tool }}
6461
6562
bash .ci/scripts/setup-conda.sh
@@ -75,5 +72,3 @@ jobs:
7572
${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml
7673
# Run gtest
7774
${CONDA_RUN} buck2 test runtime/core/... runtime/platform/...
78-
79-
popd

.github/workflows/apple.yml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -34,8 +34,6 @@ jobs:
3434
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
3535
timeout: 90
3636
script: |
37-
WORKSPACE=$(pwd)
38-
pushd "${WORKSPACE}/pytorch/executorch"
3937
BUILD_TOOL=cmake
4038
4139
.ci/scripts/setup-conda.sh
@@ -48,8 +46,6 @@ jobs:
4846
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
4947
build/test_ios_ci.sh
5048
51-
popd
52-
5349
build-frameworks-ios:
5450
name: build-frameworks-ios
5551
uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
@@ -61,8 +57,6 @@ jobs:
6157
upload-artifact: executorch-frameworks-ios
6258
timeout: 90
6359
script: |
64-
WORKSPACE=$(pwd)
65-
pushd "${WORKSPACE}/pytorch/executorch"
6660
BUILD_TOOL=cmake
6761
VERSION="0.1.0"
6862
FRAMEWORKS=(
@@ -111,8 +105,6 @@ jobs:
111105
zip -r "${RUNNER_TEMP}/artifacts/${FRAMEWORK}_debug-${VERSION}.zip" "${FRAMEWORK}_debug.xcframework"
112106
) done
113107
114-
popd
115-
116108
upload-frameworks-ios:
117109
runs-on: ubuntu-22.04
118110
needs: build-frameworks-ios

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ jobs:
9090
matrix:
9191
dtype: [fp32]
9292
build-tool: [buck2, cmake]
93-
mode: [portable, xnnpack]
93+
mode: [portable, xnnpack+kv+custom]
9494
fail-fast: false
9595
with:
9696
runner: linux.2xlarge

.github/workflows/trunk.yml

Lines changed: 1 addition & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -46,9 +46,6 @@ jobs:
4646
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
4747
timeout: ${{ matrix.timeout }}
4848
script: |
49-
WORKSPACE=$(pwd)
50-
pushd "${WORKSPACE}/pytorch/executorch"
51-
5249
MODEL_NAME=${{ matrix.model }}
5350
BUILD_TOOL=${{ matrix.build-tool }}
5451
BACKEND=${{ matrix.backend }}
@@ -59,7 +56,6 @@ jobs:
5956
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
6057
# Build and test executorch
6158
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test.sh "${MODEL_NAME}" "${BUILD_TOOL}" "${BACKEND}" "${DEMO_BACKEND_DELEGATION}"
62-
popd
6359
6460
test-custom-ops-macos:
6561
name: test-custom-ops-macos
@@ -75,17 +71,13 @@ jobs:
7571
submodules: 'true'
7672
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
7773
script: |
78-
WORKSPACE=$(pwd)
79-
pushd "${WORKSPACE}/pytorch/executorch"
80-
8174
BUILD_TOOL=${{ matrix.build-tool }}
8275
8376
bash .ci/scripts/setup-conda.sh
8477
# Setup MacOS dependencies as there is no Docker support on MacOS atm
8578
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
8679
# Build and test custom ops
8780
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/portable/custom_ops/test_custom_ops.sh "${BUILD_TOOL}"
88-
popd
8981
9082
test-selective-build-macos:
9183
name: test-selective-build-macos
@@ -101,17 +93,13 @@ jobs:
10193
submodules: 'true'
10294
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
10395
script: |
104-
WORKSPACE=$(pwd)
105-
pushd "${WORKSPACE}/pytorch/executorch"
106-
10796
BUILD_TOOL=${{ matrix.build-tool }}
10897
10998
bash .ci/scripts/setup-conda.sh
11099
# Setup MacOS dependencies as there is no Docker support on MacOS atm
111100
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
112101
# Build and test selective build
113102
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}"
114-
popd
115103
116104
test-demo-backend-delegation:
117105
name: test-demo-backend-delegation
@@ -208,17 +196,13 @@ jobs:
208196
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
209197
timeout: 90
210198
script: |
211-
WORKSPACE=$(pwd)
212-
pushd "${WORKSPACE}/pytorch/executorch"
213-
214199
BUILD_TOOL=cmake
215200
216201
bash .ci/scripts/setup-conda.sh
217202
# Setup MacOS dependencies as there is no Docker support on MacOS atm
218203
GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh "${BUILD_TOOL}"
219204
# Build and test coreml delegate
220205
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash backends/apple/coreml/scripts/build_all.sh
221-
popd
222206
223207
test-pybind-build-macos:
224208
name: test-pybind-build-macos
@@ -235,8 +219,6 @@ jobs:
235219
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
236220
timeout: 180
237221
script: |
238-
WORKSPACE=$(pwd)
239-
pushd "${WORKSPACE}/pytorch/executorch"
240222
bash .ci/scripts/setup-conda.sh
241223
242224
# build module for executorch.extension.pybindings.portable_lib
@@ -245,7 +227,6 @@ jobs:
245227
246228
# see if we can import the module successfully
247229
${CONDA_RUN} python -c "from executorch.extension.pybindings import portable_lib; print('success!')"
248-
popd
249230
250231
test-llama-runner-macos:
251232
name: test-llama-runner-mac
@@ -254,7 +235,7 @@ jobs:
254235
matrix:
255236
dtype: [fp32]
256237
build-tool: [buck2, cmake]
257-
mode: [portable, xnnpack]
238+
mode: [portable, xnnpack+kv+custom]
258239
fail-fast: false
259240
with:
260241
runner: macos-m1-stable
@@ -263,8 +244,6 @@ jobs:
263244
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
264245
timeout: 900
265246
script: |
266-
WORKSPACE=$(pwd)
267-
pushd "${WORKSPACE}/pytorch/executorch"
268247
bash .ci/scripts/setup-conda.sh
269248
270249
DTYPE=${{ matrix.dtype }}
@@ -278,4 +257,3 @@ jobs:
278257
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash examples/models/llama2/install_requirements.sh
279258
# Test llama2
280259
PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh stories110M.pt "${BUILD_TOOL}" "${DTYPE}" "${MODE}"
281-
popd

CMakeLists.txt

Lines changed: 31 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -175,15 +175,20 @@ option(EXECUTORCH_BUILD_VULKAN "Build the Vulkan backend" OFF)
175175
#
176176
# pthreadpool: build pthreadpool library. Disable on unsupported platforms
177177
#
178-
cmake_dependent_option(EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library."
179-
ON "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF)
178+
cmake_dependent_option(
179+
EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library." ON
180+
"NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF)
180181

181182
#
182183
# cpuinfo: build cpuinfo library. Disable on unsupported platforms
183184
#
184185
cmake_dependent_option(EXECUTORCH_BUILD_CPUINFO "Build cpuinfo library." ON
185186
"NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF)
186187

188+
if(EXECUTORCH_BUILD_CUSTOM)
189+
set(EXECUTORCH_BUILD_OPTIMIZED ON)
190+
endif()
191+
187192
if(EXECUTORCH_BUILD_CPUINFO)
188193
# --- cpuinfo
189194
set(CPUINFO_SOURCE_DIR "backends/xnnpack/third-party/cpuinfo")
@@ -508,24 +513,38 @@ if(EXECUTORCH_BUILD_PYBIND)
508513
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk)
509514
endif()
510515

516+
# find pytorch lib, to allow pybind to take at::Tensor as input/output
517+
find_package(Torch CONFIG REQUIRED)
518+
find_library(TORCH_PYTHON_LIBRARY torch_python
519+
PATHS "${TORCH_INSTALL_PREFIX}/lib")
520+
521+
set(_dep_libs
522+
${TORCH_PYTHON_LIBRARY}
523+
bundled_program
524+
etdump
525+
executorch
526+
extension_data_loader
527+
portable_ops_lib
528+
util
529+
torch)
530+
511531
if(EXECUTORCH_BUILD_COREML)
512-
set(PYBIND_LINK_COREML "coremldelegate")
532+
list(APPEND _dep_libs coremldelegate)
513533
endif()
514534

515535
if(EXECUTORCH_BUILD_MPS)
516-
set(PYBIND_LINK_MPS "mpsdelegate")
536+
list(APPEND _dep_libs mpsdelegate)
517537
endif()
518538

519539
if(EXECUTORCH_BUILD_XNNPACK)
520-
# need to explicitly specify XNNPACK here
521-
# otherwise uses XNNPACK symbols from libtorch_cpu
522-
set(PYBIND_LINK_XNNPACK xnnpack_backend XNNPACK)
540+
# need to explicitly specify XNNPACK here otherwise uses XNNPACK symbols
541+
# from libtorch_cpu
542+
list(APPEND _dep_libs xnnpack_backend XNNPACK)
523543
endif()
524544

525-
# find pytorch lib, to allow pybind to take at::Tensor as input/output
526-
find_package(Torch CONFIG REQUIRED)
527-
find_library(TORCH_PYTHON_LIBRARY torch_python
528-
PATHS "${TORCH_INSTALL_PREFIX}/lib")
545+
if(EXECUTORCH_BUILD_CUSTOM)
546+
list(APPEND _dep_libs custom_ops_lib)
547+
endif()
529548

530549
# compile options for pybind
531550

@@ -548,19 +567,7 @@ if(EXECUTORCH_BUILD_PYBIND)
548567
PUBLIC EXECUTORCH_PYTHON_MODULE_NAME=portable_lib)
549568
target_include_directories(portable_lib PRIVATE ${TORCH_INCLUDE_DIRS})
550569
target_compile_options(portable_lib PUBLIC ${_pybind_compile_options})
551-
target_link_libraries(
552-
portable_lib
553-
PUBLIC ${TORCH_PYTHON_LIBRARY}
554-
bundled_program
555-
etdump
556-
executorch
557-
extension_data_loader
558-
portable_ops_lib
559-
util
560-
torch
561-
${PYBIND_LINK_COREML}
562-
${PYBIND_LINK_MPS}
563-
${PYBIND_LINK_XNNPACK})
570+
target_link_libraries(portable_lib PUBLIC ${_dep_libs})
564571

565572
install(TARGETS portable_lib
566573
LIBRARY DESTINATION executorch/extension/pybindings)

backends/qualcomm/builders/op_linear.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,14 @@ def define_node(
4040
linear_input_tensors.append(input_tensor_wrapper)
4141

4242
weight_node = node.args[1]
43+
if (
44+
quant_attrs := weight_node.meta.get("quant_attrs")
45+
) and "scales" in quant_attrs:
46+
# Dimension of weight is [m, n], per channel quant params is [m]
47+
# Change to [m, 1] to fit the tensor.div(s).add(z)
48+
quant_attrs["scales"] = quant_attrs["scales"].reshape([-1, 1])
49+
quant_attrs["zero_points"] = quant_attrs["zero_points"].reshape([-1, 1])
50+
4351
weight_tensor = get_parameter(weight_node, self.edge_program)
4452
weight_tensor_wrapper = self.define_tensor(
4553
weight_node,
@@ -52,6 +60,12 @@ def define_node(
5260

5361
if len(node.args) >= 3:
5462
bias_node = node.args[2]
63+
64+
# TODO remove this when qnn sdk support
65+
if "scales" in bias_node.meta.get("quant_attrs"):
66+
print(
67+
f"[WARNING] Fallback linear bias, {bias_node}. Per-channel bias quantization is not supported yet."
68+
)
5569
bias_tensor = get_parameter(bias_node, self.edge_program)
5670
bias_tensor_wrapper = self.define_tensor(
5771
bias_node,

0 commit comments

Comments
 (0)