Commit 18d44cd

Update base for Update on "[Executorch][quant] Optimize per channel dequantize"
When using a quantized KV cache, the dequantization routine takes a significant amount of time. This diff vectorizes per-channel dequantization for the common case. Differential Revision: [D63338858](https://our.internmc.facebook.com/intern/diff/D63338858/) [ghstack-poisoned]
2 parents 5d9d688 + e172c5c commit 18d44cd
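
For context on the optimization named in the commit message: per-channel dequantization applies a separate scale and zero point to each channel of a quantized tensor, so the hot path is an affine transform over every element of each channel. The scalar sketch below only illustrates that access pattern; the function name and signature are hypothetical and this is not the ExecuTorch kernel touched by this diff, which vectorizes the equivalent inner loop for the common case.

#include <cstdint>

// Hypothetical scalar reference for per-channel dequantization (illustration
// only, not the ExecuTorch implementation). Each of the `channels` rows has
// its own scale and zero point; the inner loop over `elems_per_channel` is
// the part a vectorized kernel would optimize.
void dequantize_per_channel_reference(
    const int8_t* in,
    const float* scales,        // one scale per channel
    const int32_t* zero_points, // one zero point per channel
    float* out,
    int64_t channels,
    int64_t elems_per_channel) {
  for (int64_t c = 0; c < channels; ++c) {
    const float scale = scales[c];
    const int32_t zp = zero_points[c];
    const int64_t base = c * elems_per_channel;
    for (int64_t i = 0; i < elems_per_channel; ++i) {
      // Affine dequantize: scale * (x - zero_point), as in the reference kernels.
      out[base + i] = scale * (static_cast<float>(in[base + i]) - zp);
    }
  }
}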

61 files changed, +1349 -536 lines

Lines changed: 66 additions & 0 deletions
@@ -0,0 +1,66 @@
+name: Android Release Artifacts
+
+on:
+  workflow_dispatch:
+    inputs:
+      version:
+        description: Version name to be uploaded for AAR release
+        required: false
+        type: string
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-aar:
+    name: build-aar
+    uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-clang12-android
+      submodules: 'true'
+      ref: ${{ github.sha }}
+      timeout: 90
+      upload-artifact: android-apps
+      upload-artifact-to-s3: true
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh buck2
+        export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
+
+        # Build LLM Demo for Android
+        bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
+
+        shasum -a 256 "${ARTIFACTS_DIR_NAME}/llm_demo/executorch.aar"
+
+  upload-release-aar:
+    name: upload-release-aar
+    needs: build-aar
+    runs-on: ubuntu-22.04
+    timeout-minutes: 10
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - name: configure aws credentials
+        uses: aws-actions/[email protected]
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_executorch_upload-frameworks-android
+          aws-region: us-east-1
+      - name: Upload AAR RC to AWS S3
+        shell: bash
+        run: |
+          wget https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/llm_demo/executorch.aar
+          shasum -a 256 executorch.aar > executorch.aar.sha256sums
+
+          pip install awscli==1.32.18
+          AWS_CMD="aws s3 cp"
+          VERSION="${{ inputs.version }}"
+          VERSION_NAME="${VERSION:-temp_snapshot}"
+          ${AWS_CMD} executorch.aar s3://ossci-android/executorch/release/${VERSION_NAME}/executorch.aar --acl public-read
+          ${AWS_CMD} executorch.aar.sha256sums s3://ossci-android/executorch/release/${VERSION_NAME}/executorch.aar.sha256sums --acl public-read

README.md

Lines changed: 3 additions & 0 deletions
@@ -24,6 +24,9 @@ Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-st
 
 Check out the examples of [Llama](./examples/models/llama2/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch.
 
+
+**[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama2/README.md) models via ExecuTorch.
+
 ## Feedback
 
 We welcome any feedback, suggestions, and bug reports from the community to help

backends/cadence/CMakeLists.txt

Lines changed: 0 additions & 49 deletions
@@ -20,7 +20,6 @@ if(NOT EXECUTORCH_ROOT)
 endif()
 
 include(${EXECUTORCH_ROOT}/build/Utils.cmake)
-include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
 
 # Let files say "include <executorch/path/to/header.h>".
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
@@ -30,54 +29,6 @@ if(EXECUTORCH_NNLIB_OPT)
   set(TARGET_DIR hifi)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/third-party/nnlib)
 endif()
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
-
-# Source root directory for executorch.
-if(NOT EXECUTORCH_ROOT)
-  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
-endif()
-
-if(NOT PYTHON_EXECUTABLE)
-  resolve_python_executable()
-endif()
-
-set(_common_compile_options -Wno-deprecated-declarations -fPIC)
-
-# Find prebuilt libraries. executorch package should contain portable_ops_lib,
-# etdump, bundled_program.
-find_package(executorch CONFIG REQUIRED)
-target_link_options_shared_lib(executorch)
-target_link_options_shared_lib(portable_ops_lib)
-
-target_include_directories(executorch INTERFACE ${_common_include_directories})
-
-find_package(
-  gflags REQUIRED PATHS ${CMAKE_CURRENT_BINARY_DIR}/../../third-party
-)
-
-add_executable(cadence_runner cadence_runner/cadence_runner.cpp)
-target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED)
 
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/operators)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels)
-
-target_include_directories(
-  etdump INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../sdk/include
-                   ${EXECUTORCH_ROOT}/third-party/flatcc/include
-)
-
-target_include_directories(
-  cadence_runner PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR}
-                        ${_common_include_directories}
-)
-
-target_link_libraries(
-  cadence_runner
-  executorch
-  gflags
-  etdump
-  extension_data_loader
-  bundled_program
-  cadence_ops_lib
-  flatccrt
-)

backends/cadence/build_cadence_xtensa.sh

Lines changed: 4 additions & 5 deletions
@@ -65,20 +65,19 @@ else
     -DEXECUTORCH_BUILD_HOST_TARGETS=ON \
     -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON \
     -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \
-    -DEXECUTORCH_BUILD_CADENCE=OFF \
+    -DEXECUTORCH_BUILD_CPUINFO=OFF \
+    -DEXECUTORCH_BUILD_FLATC=OFF \
+    -DEXECUTORCH_BUILD_CADENCE=ON \
     -DFLATC_EXECUTABLE="$(which flatc)" \
+    -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
     -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DEXECUTORCH_ENABLE_PROGRAM_VERIFICATION=ON \
     -DEXECUTORCH_USE_DL=OFF \
     -DBUILD_EXECUTORCH_PORTABLE_OPS=ON \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=OFF \
     -DPYTHON_EXECUTABLE=python3 \
     -DEXECUTORCH_NNLIB_OPT=ON \
-    -DEXECUTORCH_BUILD_GFLAGS=ON \
     -DHAVE_FNMATCH_H=OFF \
-    -DEXECUTORCH_ENABLE_EVENT_TRACER=OFF \
-    -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
-    -DEXECUTORCH_BUILD_CPUINFO=OFF \
     -Bcmake-out
   cmake --build cmake-out --target install --config Release -j16
 fi
Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Set the minimum required version of CMake for this project.
+cmake_minimum_required(VERSION 3.10)
+
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+
+# Set the project name.
+project(cadence_backend)
+
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
+endif()
+
+include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
+
+# Let files say "include <executorch/path/to/header.h>".
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+set(TARGET_DIR reference)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+if(NOT PYTHON_EXECUTABLE)
+  resolve_python_executable()
+endif()
+
+# Find prebuilt libraries. executorch package should contain portable_ops_lib,
+# etdump, bundled_program.
+find_package(executorch CONFIG REQUIRED)
+target_link_options_shared_lib(executorch)
+target_link_options_shared_lib(portable_ops_lib)
+
+target_include_directories(executorch INTERFACE ${_common_include_directories})
+
+find_package(
+  gflags REQUIRED PATHS ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party
+)
+
+add_executable(cadence_runner cadence_runner.cpp)
+target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED)
+
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/operators)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${TARGET_DIR}/kernels)
+
+target_include_directories(
+  etdump INTERFACE ${CMAKE_CURRENT_BINARY_DIR}/../../../devtools/include
+                   ${EXECUTORCH_ROOT}/third-party/flatcc/include
+)
+
+target_include_directories(
+  cadence_runner PUBLIC ${ROOT_DIR}/../.. ${CMAKE_BINARY_DIR}
+                        ${_common_include_directories}
+)
+
+target_link_libraries(
+  cadence_runner
+  executorch
+  gflags
+  etdump
+  extension_data_loader
+  bundled_program
+  cadence_ops_lib
+  flatccrt
+)

backends/cadence/build_cadence_runner.sh renamed to backends/cadence/cadence_runner/build_cadence_runner.sh

Lines changed: 5 additions & 4 deletions
@@ -12,7 +12,7 @@ set -euo pipefail
 SCRIPT_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
 readonly SCRIPT_DIR
 
-readonly EXECUTORCH_ROOT="${SCRIPT_DIR}/../.."
+readonly EXECUTORCH_ROOT="${SCRIPT_DIR}/../../.."
 
 # Allow overriding the number of build jobs. Default to 9.
 export CMAKE_BUILD_PARALLEL_LEVEL="${CMAKE_BUILD_PARALLEL_LEVEL:-9}"
@@ -32,8 +32,9 @@ main() {
     -DEXECUTORCH_BUILD_PTHREADPOOL=OFF \
     -DEXECUTORCH_BUILD_CPUINFO=OFF \
     -DEXECUTORCH_ENABLE_LOGGING=ON \
-    -Bcmake-out .
-  cmake --build cmake-out --target install --config Release
+    -DEXECUTORCH_NNLIB_OPT=OFF \
+    -Bcmake-out
+  cmake --build cmake-out --target install --config Release -j16
 
   local example_dir=backends/cadence
   local build_dir="cmake-out/${example_dir}"
@@ -43,7 +44,7 @@ main() {
     -DCMAKE_BUILD_TYPE=Release \
    -B"${build_dir}" \
     "${example_dir}"
-  cmake --build "${build_dir}" --config Release
+  cmake --build "${build_dir}" --config Release -j16
 
   local runner="${PWD}/${build_dir}/cadence_runner"
   if [[ ! -f "${runner}" ]]; then

backends/cadence/hifi/kernels/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -10,6 +10,8 @@ add_library(
   kernels.cpp
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
 )
+# Let files say "include <executorch/path/to/header.h>".
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 
 target_include_directories(
   cadence_kernels
@@ -19,6 +21,7 @@ target_include_directories(
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include/nnlib
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/include
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4/xa_nnlib/algo/ndsp/hifi4/include/
+  ${_common_include_directories}
 )
 
 target_link_libraries(cadence_kernels PRIVATE xa_nnlib)

backends/cadence/hifi/operators/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@ set(_aten_ops__srcs
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_add.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"

backends/cadence/reference/kernels/CMakeLists.txt

Lines changed: 6 additions & 1 deletion
@@ -7,4 +7,9 @@
 # lint_cmake: -linelength
 add_library(cadence_kernels kernels.cpp)
 
-target_include_directories(cadence_kernels PUBLIC .)
+# Let files say "include <executorch/path/to/header.h>".
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+target_include_directories(cadence_kernels PUBLIC .
+                           ${_common_include_directories}
+)

backends/cadence/reference/kernels/kernels.cpp

Lines changed: 7 additions & 10 deletions
@@ -9,6 +9,7 @@
 #include <math.h>
 #include <algorithm>
 #include <cstring>
+#include <limits>
 #include <numeric>
 
 namespace impl {
@@ -17,8 +18,7 @@ namespace kernels {
 
 // Quantize a fp32 value to an int8_t/uint8_t value
 template <typename T>
-__attribute__((always_inline)) T
-quantize(const float x, float scale, int32_t zero_point) {
+T quantize(const float x, float scale, int32_t zero_point) {
   constexpr float min_val = std::numeric_limits<T>::min();
   constexpr float max_val = std::numeric_limits<T>::max();
   float tmp = roundf(x * scale + zero_point);
@@ -40,8 +40,7 @@ void quantize(
 
 // Dequantize an int8_t/uint8_t value to an fp32 value
 template <typename T>
-__attribute__((always_inline)) float
-dequantize(const T x, float scale, int32_t zero_point) {
+float dequantize(const T x, float scale, int32_t zero_point) {
   return scale * (x - zero_point);
 }
 
@@ -60,9 +59,8 @@ void dequantize(
 
 // explicit template instantiation
 
-#define typed_quantize_val(dtype)                          \
-  template __attribute__((always_inline)) dtype quantize(  \
-      const float x, float inv_scale, int32_t zero_point);
+#define typed_quantize_val(dtype) \
+  template dtype quantize(const float x, float inv_scale, int32_t zero_point);
 typed_quantize_val(int8_t);
 typed_quantize_val(uint8_t);
 typed_quantize_val(int16_t);
@@ -82,9 +80,8 @@ typed_quantize_vec(int16_t);
 typed_quantize_vec(int32_t);
 #undef typed_quantize_vec
 
-#define typed_dequantize_val(dtype)                          \
-  template __attribute__((always_inline)) float dequantize(  \
-      const dtype x, float scale, int32_t zero_point);
+#define typed_dequantize_val(dtype) \
+  template float dequantize(const dtype x, float scale, int32_t zero_point);
 typed_dequantize_val(int8_t);
 typed_dequantize_val(uint8_t);
 typed_dequantize_val(int16_t);
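
The quantize/dequantize pair above follows the usual affine mapping: quantize computes round(x * inv_scale + zero_point) clamped to the target type's range, and dequantize computes scale * (x - zero_point). A minimal standalone round-trip sketch of those formulas, written here purely for illustration (it reuses the formulas, not the ExecuTorch headers):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <limits>

// Standalone illustration of the affine quantize/dequantize formulas seen in
// kernels.cpp above; not the ExecuTorch implementation itself.
template <typename T>
T quantize(float x, float inv_scale, int32_t zero_point) {
  constexpr float min_val = std::numeric_limits<T>::min();
  constexpr float max_val = std::numeric_limits<T>::max();
  float tmp = std::round(x * inv_scale + zero_point);
  return static_cast<T>(std::max(min_val, std::min(max_val, tmp)));
}

template <typename T>
float dequantize(T x, float scale, int32_t zero_point) {
  return scale * (static_cast<float>(x) - zero_point);
}

int main() {
  const float scale = 0.05f;     // dequantization scale
  const int32_t zero_point = 10; // shared zero point
  const float x = 1.23f;
  // The quantize helper takes the inverse scale, matching the kernel's
  // `inv_scale` parameter name.
  const int8_t q = quantize<int8_t>(x, 1.0f / scale, zero_point);
  const float back = dequantize<int8_t>(q, scale, zero_point);
  std::printf("q=%d, dequantized=%.4f\n", q, back); // prints q=35, dequantized=1.2500
  return 0;
}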

backends/cadence/reference/operators/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -32,6 +32,7 @@ set(_aten_ops__srcs
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp"
+  "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhb_to_floath.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp"
   "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp"

backends/cadence/runtime/executor.py

Lines changed: 3 additions & 1 deletion
@@ -106,7 +106,9 @@ def __init__(
         working_dir: str = "",
     ):
         self.working_dir = working_dir
-        self.executor_builder = "./backends/cadence/build_cadence_runner.sh"
+        self.executor_builder = (
+            "./backends/cadence/cadence_runner/build_cadence_runner.sh"
+        )
         self.execute_runner = "./cmake-out/backends/cadence/cadence_runner"
         self.bundled_program_path: str = "CadenceDemoModel.bpte"

backends/qualcomm/aot/python/TARGETS

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+load(":targets.bzl", "define_common_targets")
+
+oncall("executorch")
+
+define_common_targets()
