Skip to content

Commit dc9b241

Browse files
committed
Update on "[ExecuTorch] Precompute multiplicative inverse when possible in op_div"
Division is generally slower than multiplication in hardware. Differential Revision: [D62412539](https://our.internmc.facebook.com/intern/diff/D62412539/) [ghstack-poisoned]
2 parents 92e5a7e + e143f64 commit dc9b241

File tree

306 files changed

+7974
-1582
lines changed

Some content is hidden

Large commits have some of their content hidden by default. Use the search box below to find content that may be hidden.

306 files changed

+7974
-1582
lines changed

.ci/scripts/build-qnn-sdk.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ set -o xtrace
1111
build_qnn_backend() {
1212
echo "Start building qnn backend."
1313
export ANDROID_NDK_ROOT=/opt/ndk
14-
export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531
14+
export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
1515
export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
1616

1717
bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release

.ci/scripts/setup-qnn-deps.sh

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,18 @@
77

88
set -ex
99

10+
verify_pkg_installed() {
11+
echo $(dpkg-query -W --showformat='${Status}\n' $1|grep "install ok installed")
12+
}
13+
1014
install_qnn() {
1115
echo "Start installing qnn."
1216
QNN_INSTALLATION_DIR=/tmp/qnn
1317
mkdir -p "${QNN_INSTALLATION_DIR}"
1418

15-
curl -Lo /tmp/v2.23.0.24.06.24.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.23.0.24.06.24.zip"
19+
curl -Lo /tmp/v2.25.0.24.07.28.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip"
1620
echo "Finishing downloading qnn sdk."
17-
unzip -qo /tmp/v2.23.0.24.06.24.zip -d /tmp
21+
unzip -qo /tmp/v2.25.0.24.07.28.zip -d /tmp
1822
echo "Finishing unzip qnn sdk."
1923

2024

@@ -26,4 +30,22 @@ install_qnn() {
2630
ls -lah "${QNN_INSTALLATION_DIR}"
2731
}
2832

33+
setup_libc++() {
34+
sudo apt-get update
35+
pkgs_to_check=('libc++-dev')
36+
j=0
37+
while [ $j -lt ${#pkgs_to_check[*]} ]; do
38+
install_status=$(verify_pkg_installed ${pkgs_to_check[$j]})
39+
if [ "$install_status" == "" ]; then
40+
sudo apt-get install -y ${pkgs_to_check[$j]}
41+
if [[ $? -ne 0 ]]; then
42+
echo "ERROR: Failed to install required packages for libc++"
43+
exit 1
44+
fi
45+
fi
46+
j=$(( $j +1));
47+
done
48+
}
49+
50+
setup_libc++
2951
install_qnn

.ci/scripts/test_llama.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ echo "COREML option ${COREML}"
7575
if [[ "${MODE}" =~ .*qnn.* ]]; then
7676
QNN=ON
7777
export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
78-
export QNN_SDK_ROOT=/tmp/qnn/2.23.0.240531
78+
export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
7979
export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
8080
export PYTHONPATH=".."
8181
cp schema/program.fbs exir/_serialize/program.fbs

.ci/scripts/test_llava.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ if hash nproc &> /dev/null; then NPROC=$(nproc); fi
3333
EXECUTORCH_COMMON_CMAKE_ARGS=" \
3434
-DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
3535
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
36+
-DEXECUTORCH_ENABLE_LOGGING=ON \
3637
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
3738
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
3839
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \

.ci/scripts/test_model.sh

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,13 @@ elif [[ "${BACKEND}" == "coreml" ]]; then
209209
fi
210210
elif [[ "${BACKEND}" == "xnnpack" ]]; then
211211
echo "Testing ${MODEL_NAME} with xnnpack..."
212-
test_model_with_xnnpack true true
212+
WITH_QUANTIZATION=true
213+
WITH_DELEGATION=true
214+
if [[ "$MODEL_NAME" == "mobilebert" ]]; then
215+
# TODO(T197452682)
216+
WITH_QUANTIZATION=false
217+
fi
218+
test_model_with_xnnpack "${WITH_QUANTIZATION}" "${WITH_DELEGATION}"
213219
if [[ $? -eq 0 ]]; then
214220
prepare_artifacts_upload
215221
fi

.github/workflows/android-perf.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ jobs:
178178
upload-models:
179179
needs: export-models
180180
runs-on: linux.2xlarge
181+
if: always() # Continue this job regardless of previous job outcome
181182
steps:
182183
- name: Download the models from GitHub
183184
uses: actions/download-artifact@v3

.github/workflows/apple-perf.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,8 @@ jobs:
165165
# Test llama2
166166
if [[ ${{ matrix.delegate }} == "xnnpack" ]]; then
167167
DELEGATE_CONFIG="xnnpack+custom+qe"
168+
elif [[ ${{ matrix.delegate }} == "coreml" ]]; then
169+
DELEGATE_CONFIG="coreml"
168170
fi
169171
PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
170172
bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}"
@@ -177,6 +179,7 @@ jobs:
177179
upload-models:
178180
needs: export-models
179181
runs-on: linux.2xlarge
182+
if: always() # Continue this job regardless of previous job outcome
180183
steps:
181184
- name: Download the models from GitHub
182185
uses: actions/download-artifact@v3

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ option(EXECUTORCH_BUILD_KERNELS_OPTIMIZED "Build the optimized kernels" OFF)
197197

198198
option(EXECUTORCH_BUILD_KERNELS_QUANTIZED "Build the quantized kernels" OFF)
199199

200-
option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch SDK")
200+
option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch Developer Tools")
201201

202202
option(EXECUTORCH_BUILD_SIZE_TEST "Build the size test" OFF)
203203

CONTRIBUTING.md

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -131,9 +131,7 @@ for detailed advice.
131131

132132
#### C++ language version
133133

134-
**C++11.**
135-
136-
NOTE: The code does not yet fully conform to this, and some files require C++17.
134+
**C++17.**
137135

138136
Rationale: This is a compromise between being compatible with older, proprietary
139137
toolchains, and having access to relatively modern C++ features.

README.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@ Key value propositions of ExecuTorch are:
1010
- **Portability:** Compatibility with a wide variety of computing platforms,
1111
from high-end mobile phones to highly constrained embedded systems and
1212
microcontrollers.
13-
- **Productivity:** Enabling developers to use the same toolchains and SDK from
14-
PyTorch model authoring and conversion, to debugging and deployment to a wide
15-
variety of platforms.
13+
- **Productivity:** Enabling developers to use the same toolchains and Developer
14+
Tools from PyTorch model authoring and conversion, to debugging and deployment
15+
to a wide variety of platforms.
1616
- **Performance:** Providing end users with a seamless and high-performance
1717
experience due to a lightweight runtime and utilizing full hardware
1818
capabilities such as CPUs, NPUs, and DSPs.

backends/apple/coreml/compiler/coreml_preprocess.py

Lines changed: 58 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
# CoreML backend for delegating a EdgeProgram to CoreML.
44

55
import json
6+
import logging
67

78
import shutil
89
import uuid
@@ -14,6 +15,7 @@
1415
from typing import Any, Dict, final, List, Optional, Tuple
1516

1617
import coremltools as ct
18+
import coremltools.optimize as cto
1719
import executorchcoreml
1820

1921
from executorch.exir.backend.backend_details import (
@@ -23,12 +25,16 @@
2325
)
2426
from executorch.exir.backend.compile_spec_schema import CompileSpec
2527

28+
logger = logging.getLogger(__name__)
29+
logger.setLevel(logging.WARNING)
30+
2631

2732
class COMPILE_SPEC_KEYS(Enum):
2833
COMPUTE_UNITS = "compute_units"
2934
MODEL_TYPE = "model_type"
3035
MIN_DEPLOYMENT_TARGET = "min_deployment_target"
3136
MODEL_COMPUTE_PRECISION = "model_compute_precision"
37+
OP_LINEAR_QUANTIZER_CONFIG = "op_linear_quantizer_config"
3238

3339

3440
class MODEL_PATHS(Enum):
@@ -169,12 +175,44 @@ def generate_compute_unit_compile_spec(
169175
compute_unit.name.lower().encode("utf-8"),
170176
)
171177

178+
@staticmethod
179+
def generate_op_linear_quantizer_config_compile_spec(
180+
op_linear_quantizer_config: Dict,
181+
) -> CompileSpec:
182+
"""
183+
Returns the compile spec representing the model post conversion quantization,
184+
which is a dict that will construct cto.coreml.OpLinearQuantizerConfig
185+
"""
186+
str_representation = json.dumps(op_linear_quantizer_config)
187+
byte_representation = str_representation.encode("utf-8")
188+
return CompileSpec(
189+
COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value,
190+
byte_representation,
191+
)
192+
193+
@staticmethod
194+
def op_linear_quantizer_config_from_compile_specs(
195+
compile_specs: List[CompileSpec],
196+
) -> cto.coreml.OpLinearQuantizerConfig:
197+
"""
198+
Returns the model's post conversion quantization by parsing the list of compile specs.
199+
"""
200+
for compile_spec in compile_specs:
201+
if compile_spec.key == COMPILE_SPEC_KEYS.OP_LINEAR_QUANTIZER_CONFIG.value:
202+
config_dict_str = compile_spec.value.decode("utf-8")
203+
config_dict = json.loads(config_dict_str)
204+
config = cto.coreml.OpLinearQuantizerConfig._from_dict(config_dict)
205+
return config
206+
207+
return None
208+
172209
@staticmethod
173210
def generate_compile_specs(
174211
compute_unit: ct.ComputeUnit = ct.ComputeUnit.ALL,
175212
minimum_deployment_target: ct.target = ct.target.iOS15,
176213
compute_precision: ct.precision = ct.precision.FLOAT16,
177214
model_type: MODEL_TYPE = MODEL_TYPE.MODEL,
215+
op_linear_quantizer_config: Optional[Dict] = None,
178216
) -> List[CompileSpec]:
179217
"""
180218
Returns the list of compile specs that's used by CoreMLBackend to lower the module.
@@ -192,6 +230,12 @@ def generate_compile_specs(
192230
CoreMLBackend.generate_compute_precision_compile_spec(compute_precision)
193231
)
194232
compile_specs.append(CoreMLBackend.generate_model_type_compile_spec(model_type))
233+
if op_linear_quantizer_config is not None:
234+
compile_specs.append(
235+
CoreMLBackend.generate_op_linear_quantizer_config_compile_spec(
236+
op_linear_quantizer_config
237+
)
238+
)
195239

196240
return compile_specs
197241

@@ -368,18 +412,18 @@ def preprocess(
368412
compile_specs,
369413
)
370414
)
371-
372415
model_compute_precision: ct.precision = (
373416
CoreMLBackend.model_compute_precision_from_compile_specs(compile_specs)
374417
)
375-
376418
minimum_deployment_target: ct.target = (
377419
CoreMLBackend.min_deployment_target_from_compile_specs(compile_specs)
378420
)
379-
380421
compute_units: ct.ComputeUnit = CoreMLBackend.compute_unit_from_compile_specs(
381422
compile_specs
382423
)
424+
op_linear_quantizer_config = (
425+
CoreMLBackend.op_linear_quantizer_config_from_compile_specs(compile_specs)
426+
)
383427

384428
mlmodel = ct.convert(
385429
model=edge_program,
@@ -392,4 +436,15 @@ def preprocess(
392436
compute_units=compute_units,
393437
)
394438

439+
if op_linear_quantizer_config is not None:
440+
logger.warning(
441+
"Core ML Backend op_linear_quantizer_config API is experimental"
442+
)
443+
config = cto.coreml.OptimizationConfig(
444+
global_config=op_linear_quantizer_config,
445+
# skip embedding
446+
op_type_configs={"gather": None},
447+
)
448+
mlmodel = cto.coreml.linear_quantize_weights(mlmodel, config=config)
449+
395450
return CoreMLBackend.preprocess_model(mlmodel, model_type=model_type)

backends/apple/coreml/partition/coreml_partitioner.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
Partitioner,
1818
PartitionResult,
1919
)
20-
from executorch.exir.backend.utils import tag_constant_data
20+
from executorch.exir.backend.utils import tag_constant_data, tag_mutated_buffer
2121
from torch.export.exported_program import ExportedProgram
2222
from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
2323
from torch.fx.passes.operator_support import OperatorSupportBase
@@ -61,6 +61,7 @@ def __init__(
6161
self,
6262
skip_ops_for_coreml_delegation: Optional[List[str]] = None,
6363
compile_specs: Optional[List[CompileSpec]] = None,
64+
take_over_mutable_buffer: Optional[bool] = True,
6465
) -> None:
6566
if skip_ops_for_coreml_delegation is None:
6667
skip_ops_for_coreml_delegation = []
@@ -69,6 +70,7 @@ def __init__(
6970
backend_id=CoreMLBackend.__name__,
7071
compile_specs=compile_specs if compile_specs is not None else [],
7172
)
73+
self.take_over_mutable_buffer = take_over_mutable_buffer
7274

7375
def partition(self, exported_program: ExportedProgram) -> PartitionResult:
7476
# Run the CapabilityBasedPartitioner to return the largest possible
@@ -89,6 +91,15 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
8991
partition_tags[tag] = self.delegation_spec
9092

9193
tag_constant_data(exported_program)
94+
if self.take_over_mutable_buffer:
95+
logger.info(
96+
"Core ML partitioner will take over torch mutable buffer as Core ML state, "
97+
"so if your model contains mutable buffer, "
98+
"then you will need MacOS15+/iOS18+ to execute. "
99+
"If you want your mutable buffer model to be compatible with older OS, "
100+
"then please set `take_over_mutable_buffer=False`"
101+
)
102+
tag_mutated_buffer(exported_program)
92103

93104
return PartitionResult(
94105
tagged_exported_program=exported_program, partition_tags=partition_tags

backends/apple/coreml/scripts/install_requirements.sh

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ rm -rf "$COREML_DIR_PATH/third-party"
2424
mkdir "$COREML_DIR_PATH/third-party"
2525

2626
echo "${green}ExecuTorch: Cloning coremltools."
27-
git clone --depth 1 --branch 8.0b1 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
27+
git clone --depth 1 --branch 8.0b2 "https://github.com/apple/coremltools.git" $COREMLTOOLS_DIR_PATH
2828
cd $COREMLTOOLS_DIR_PATH
2929

3030
STATUS=$?
@@ -47,6 +47,11 @@ cmake --build "$COREMLTOOLS_DIR_PATH/build" --parallel
4747

4848
echo "${green}ExecuTorch: Installing coremltools."
4949
pip install "$COREMLTOOLS_DIR_PATH"
50+
# CoreMLTools have started supporting numpy 2.0,
51+
# but ExecuTorch example model test env is still using older transformers,
52+
# so for now we will need to downgrade numpy to 1.x
53+
# TODO: Remove this numpy downgrade once later transformers starts to be used
54+
pip install numpy==1.26.4
5055
STATUS=$?
5156
if [ $STATUS -ne 0 ]; then
5257
echo "${red}ExecuTorch: Failed to install coremltools."

0 commit comments

Comments (0)