
Commit f29c7f8

Update base for Update on "[executorch][weight sharing] Introduce NamedData to PTE schema"
See 'Schema Changes' in the [RFC](

Differential Revision: [D69430152](https://our.internmc.facebook.com/intern/diff/D69430152/)

[ghstack-poisoned]
2 parents a777e49 + 9c12c8f commit f29c7f8

20 files changed: +135 -64 lines changed

.github/workflows/_android.yml

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ jobs:
 export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
 
 # Build LLM Demo for Android
-bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
+bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME}
 
 # Running Android emulator directly on the runner and not using Docker
 run-emulator:
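
This commit renames the build script from build/build_android_llm_demo.sh to build/build_android_library.sh everywhere it is referenced. A minimal local sketch of the same step the workflow runs, assuming the renamed script still takes the artifacts directory as its only argument:

```
# Hedged sketch: mirrors the CI step above; assumes the renamed script's
# argument handling is unchanged by the rename.
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
bash build/build_android_library.sh "${ARTIFACTS_DIR_NAME}"
```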

.github/workflows/android-perf.yml

Lines changed: 1 addition & 1 deletion
@@ -363,7 +363,7 @@ jobs:
 PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
 
 export ANDROID_ABIS="arm64-v8a"
-PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
+PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME}
 
 # Let's see how expensive this job is, we might want to tone it down by running it periodically
 benchmark-on-device:

.github/workflows/android-release-artifacts.yml

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ jobs:
 export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
 
 # Build LLM Demo for Android
-bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
+bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME}
 
 shasum -a 256 "${ARTIFACTS_DIR_NAME}/llm_demo/executorch.aar"

backends/xnnpack/test/TARGETS

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ runtime.python_test(
     srcs = glob([
         "models/*.py",
     ]),
-    tags = ["long_running"],
+    labels = ["long_running"],
     deps = [
         "fbsource//third-party/pypi/timm:timm",
         "fbsource//third-party/pypi/torchsr:torchsr",  # @manual

File renamed without changes.

docs/source/backends-xnnpack.md

Lines changed: 0 additions & 1 deletion
@@ -121,4 +121,3 @@ target_link_libraries(
 ```
 
 No additional steps are necessary to use the backend beyond linking the target. Any XNNPACK-delegated .pte file will automatically run on the registered backend.
-
examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md

Lines changed: 1 addition & 1 deletion
@@ -123,7 +123,7 @@ The Mediatek runner (`examples/mediatek/executor_runner/mtk_llama_runner.cpp`) c
 
 Next we need to build and compile the MediaTek backend and MediaTek Llama runner. By setting `NEURON_BUFFER_ALLOCATOR_LIB`, the script will build the MediaTek backend.
 ```
-sh build/build_android_llm_demo.sh
+sh build/build_android_library.sh
 ```
 
 **Output**: This will generate an .aar file that is already imported into the expected directory for the Android app. It will live in `examples/demo-apps/android/Llamademo/app/libs`.
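
The README excerpt above says that setting `NEURON_BUFFER_ALLOCATOR_LIB` makes the script build the MediaTek backend, but the quoted lines do not show the variable being set. A minimal sketch, assuming the variable simply points at the MediaTek neuron buffer allocator shared library (the path and file name are placeholders, not taken from this diff):

```
# Hedged sketch: the library location is illustrative only.
export NEURON_BUFFER_ALLOCATOR_LIB=<path_to>/libneuron_buffer_allocator.so
sh build/build_android_library.sh
```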

examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ if [ -z "$QNN_SDK_ROOT" ]; then
 fi
 
 BASEDIR=$(dirname "$0")
-source "$BASEDIR"/../../../../build/build_android_llm_demo.sh
+source "$BASEDIR"/../../../../build/build_android_library.sh
 
 BUILD_AAR_DIR="$(mktemp -d)"
 export BUILD_AAR_DIR

examples/demo-apps/android/LlamaDemo/setup.sh

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 set -eu
 
 BASEDIR=$(dirname "$0")
-source "$BASEDIR"/../../../../build/build_android_llm_demo.sh
+source "$BASEDIR"/../../../../build/build_android_library.sh
 
 BUILD_AAR_DIR="$(mktemp -d)"
 export BUILD_AAR_DIR

examples/models/llama/export_llama_lib.py

Lines changed: 91 additions & 41 deletions
@@ -676,47 +676,62 @@ def _validate_args(args):
         )
 
 
-def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
-    _validate_args(args)
-
-    pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
-
-    # export_to_edge
-    builder_exported = _prepare_for_llama_export(args).export()
-
-    builder_exported.run_canonical_optimizations()
-
-    if args.export_only:
-        exit()
-
-    builder_exported_to_edge = builder_exported.pt2e_quantize(
-        quantizers
-    ).export_to_edge()
-
-    modelname = builder_exported_to_edge.modelname
-
-    # to_backend
+def _to_edge_and_lower_llama_xnnpack(
+    builder_exported,
+    modelname,
+    additional_passes,
+    pt2e_quant_params,
+    quantizers,
+    quant_dtype,
+    args,
+) -> LLMEdgeManager:  # noqa: C901
     partitioners = []
 
     # Order matters here, dynamic quantization should be applied first when both xnnpack and xnnpack_extended_ops are enabled
-    if (
-        pt2e_quant_params is not None and pt2e_quant_params.quantize_linear is not None
-    ) or (args.xnnpack):
-        partitioners.append(
-            get_xnnpack_partitioner(dynamic_quant_only_partitioner=True)
-        )
+    partitioners.append(get_xnnpack_partitioner(dynamic_quant_only_partitioner=True))
 
-        # force xnnpack to be true if pt2e_quant_params is not None and args.xnnpack is False
-        args.xnnpack = True
-        modelname = f"xnnpack_dq_{modelname}"
+    modelname = f"xnnpack_dq_{modelname}"
 
     if args.xnnpack_extended_ops:
-        assert args.xnnpack, "xnnpack_extended_ops requires xnnpack to be enabled"
         partitioners.append(
             get_xnnpack_partitioner(dynamic_quant_only_partitioner=False)
        )
         modelname = f"xnnpack_{modelname}"
 
+    logging.info("Lowering model using following partitioner(s): ")
+    for partitioner in partitioners:
+        logging.info(f"--> {partitioner.__class__.__name__}")
+
+    # TODO: Enable generating ETRecord with XNNPack and to_edge_transform_and_lower().
+    if args.generate_etrecord:
+        raise NotImplementedError(
+            "export_llama does not support XNNPack and generating ETRecord at the moment."
+        )
+
+    builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
+        partitioners
+    )
+    if args.verbose:
+        print_delegation_info(builder.edge_manager.exported_program().graph_module)
+
+    return builder.to_executorch(passes=additional_passes)
+
+
+def _to_edge_and_lower_llama(  # noqa: C901
+    builder_exported,
+    modelname,
+    additional_passes,
+    pt2e_quant_params,
+    quantizers,
+    quant_dtype,
+    args,
+):
+    builder_exported_to_edge = builder_exported.pt2e_quantize(
+        quantizers
+    ).export_to_edge()
+
+    # to_backend
+    partitioners = []
     if args.vulkan:
         partitioners.append(
             get_vulkan_partitioner(
@@ -731,7 +746,6 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
         modelname = f"vulkan_{modelname}"
 
         # Need to remove asserts from the graph to prevent graph breaks
-        # pyre-ignore: Undefined attribute [16]: `Optional` has no attribute `exported_program`.
         remove_asserts(builder_exported_to_edge.edge_manager.exported_program())
 
     if args.mps:
@@ -760,13 +774,11 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
         # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils`
         from executorch.backends.qualcomm.utils.utils import _transform, tag_quant_io
 
-        # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`, Optional type has no attribute `exported_program`
         _transform(builder_exported_to_edge.edge_manager.exported_program())
 
         if args.num_sharding > 0:
             model_sharding.split_graph(
                 builder_exported_to_edge.edge_manager.exported_program(),
-                # pyre-fixme[16]: `Optional` has no attribute `__getitem__`.
                 builder_exported_to_edge.metadata["get_n_layers"],
                 shares=args.num_sharding,
             )
@@ -792,19 +804,15 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
                     atten.head_dim,
                 )
             )
-            # pyre-ignore
             tag_quant_io(
                 builder_exported_to_edge.edge_manager.exported_program().graph_module,
-                partial(get_custom_quant_ios_dtype, cache_shape),  # pyre-ignore
+                partial(get_custom_quant_ios_dtype, cache_shape),
             )
 
     logging.info("Lowering model using following partitioner(s): ")
     for partitioner in partitioners:
         logging.info(f"--> {partitioner.__class__.__name__}")
 
-    additional_passes = []
-    if args.model in TORCHTUNE_DEFINED_MODELS:
-        additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])]
     if args.generate_etrecord:
         if not builder_exported_to_edge.edge_manager:
             raise ValueError("Unable to generate etrecord due to missing edge manager.")
@@ -818,7 +826,6 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
         if args.num_sharding > 0 and args.qnn:
             from executorch.backends.qualcomm.utils.utils import canonicalize_program
 
-            # pyre-fixme[16]: Module `backends` has no attribute `qualcomm`.
             canonicalize_program(builder.edge_manager.exported_program())
 
         builder = builder.to_executorch(
@@ -840,11 +847,55 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
         if args.num_sharding > 0 and args.qnn:
             from executorch.backends.qualcomm.utils.utils import canonicalize_program
 
-            # pyre-fixme[16]: Module `backends` has no attribute `qualcomm`.
             canonicalize_program(builder.edge_manager.exported_program())
 
         builder = builder.to_executorch(passes=additional_passes)
 
+    return builder
+
+
+def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
+    _validate_args(args)
+
+    pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
+
+    additional_passes = []
+    if args.model in TORCHTUNE_DEFINED_MODELS:
+        additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])]
+
+    # export_to_edge
+    builder_exported = _prepare_for_llama_export(args).export()
+    builder_exported.run_canonical_optimizations()
+    modelname = builder_exported.modelname
+
+    if args.export_only:
+        exit()
+
+    if pt2e_quant_params is not None and pt2e_quant_params.quantize_linear is not None:
+        # Force xnnpack to be true if pt2e_quant_params is not None and args.xnnpack is False
+        args.xnnpack = True
+
+    if args.xnnpack:
+        builder = _to_edge_and_lower_llama_xnnpack(
+            builder_exported,
+            modelname,
+            additional_passes,
+            pt2e_quant_params,
+            quantizers,
+            quant_dtype,
+            args,
+        )
+    else:
+        builder = _to_edge_and_lower_llama(
+            builder_exported,
+            modelname,
+            additional_passes,
+            pt2e_quant_params,
+            quantizers,
+            quant_dtype,
+            args,
+        )
+
     if args.profile_memory:
         generate_memory_trace(builder.export_program, "memory_profile.json")
 
@@ -866,7 +917,6 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
     output_file = f"{builder.output_dir}/{modelname}.pte"
 
     builder.save_to_pte(output_file)
-
     return builder
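
In short, `_export_llama` now validates the args, exports the model, and then dispatches: the XNNPACK path goes through `to_edge_transform_and_lower()` in `_to_edge_and_lower_llama_xnnpack`, while every other backend keeps the previous `export_to_edge()` flow in `_to_edge_and_lower_llama`. A minimal sketch of reaching the XNNPACK path from the command line; the flag spellings are assumptions inferred from the `args.xnnpack` attribute used above, and the model files are placeholders:

```
# Hedged sketch: flag names mirror the args.* attributes in the diff and may
# not match the real CLI exactly; checkpoint/params paths are placeholders.
python -m examples.models.llama.export_llama \
  --checkpoint <path_to_checkpoint.pth> \
  --params <path_to_params.json> \
  --xnnpack
```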

examples/models/llama/source_transformation/quantize.py

Lines changed: 2 additions & 2 deletions
@@ -14,8 +14,6 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from executorch.backends.vulkan._passes import VkInt4WeightOnlyQuantizer
-
 from executorch.extension.llm.export.builder import DType
 
 from sentencepiece import SentencePieceProcessor
@@ -180,6 +178,8 @@ def quantize(  # noqa C901
         model = gptq_quantizer.quantize(model, inputs)
         return model
     elif qmode == "vulkan_4w":
+        from executorch.backends.vulkan._passes import VkInt4WeightOnlyQuantizer
+
         q_group_size = 256 if group_size is None else group_size
         model = VkInt4WeightOnlyQuantizer(groupsize=q_group_size).quantize(model)
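
This moves the `VkInt4WeightOnlyQuantizer` import inside the `vulkan_4w` branch, so the Vulkan backend only has to be importable when that quantization mode is actually requested. A hedged sketch of requesting that mode through the llama export flow; the flag spellings are assumptions, and only `qmode == "vulkan_4w"` and the group size default of 256 are grounded in the diff above:

```
# Hedged sketch: flag names are assumed, not confirmed by this diff.
python -m examples.models.llama.export_llama \
  --checkpoint <path_to_checkpoint.pth> \
  --params <path_to_params.json> \
  -qmode vulkan_4w \
  --group_size 256
```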

examples/models/llava/export_llava.py

Lines changed: 0 additions & 1 deletion
@@ -67,7 +67,6 @@ def export(self) -> "LlavaEdgeManager":
             dynamic_shapes=dynamic_shape,
             strict=False,
         )
-        # pyre-ignore: Incompatible attribute type [8]: Attribute `pre_autograd_graph_module` declared in class `LLMEdgeManager` has type `Optional[GraphModule]` but is used as type `Module`.
         self.pre_autograd_graph_module = self.export_program.module()
         return self

exir/dialects/edge/test/TARGETS

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ python_unittest(
     resources = {
         "//executorch/exir/dialects/edge:edge_yaml": "edge.yaml",
     },
-    tags = ["long_running"],
+    labels = ["long_running"],
     deps = [
         "fbsource//third-party/pypi/expecttest:expecttest",  # @manual
         "//caffe2:torch",

extension/android_test/setup.sh

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ BUILD_AAR_DIR="$(mktemp -d)"
 export BUILD_AAR_DIR
 
 BASEDIR=$(dirname "$0")
-source "$BASEDIR"/../../build/build_android_llm_demo.sh
+source "$BASEDIR"/../../build/build_android_library.sh
 
 build_native_library() {
   ANDROID_ABI="$1"

extension/benchmark/android/benchmark/README.md

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ Minibench is usedful for giving reference performance data when developers integ
 You will need executorch AAR for Java and JNI dependencies.
 ```
 export ANDROID_NDK=<path_to_android_ndk>
-sh build/build_android_llm_demo.sh
+sh build/build_android_library.sh
 ```
 and copy the AAR to `app/libs`.
 ```

extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ protected void onCreate(Bundle savedInstanceState) {
             .get();
 
     int numIter = intent.getIntExtra("num_iter", 50);
-    int numWarmupIter = intent.getIntExtra("num_warm_up_iter", 5);
+    int numWarmupIter = intent.getIntExtra("num_warm_up_iter", 10);
 
     // TODO: Format the string with a parsable format
     Stats stats = new Stats();
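
The default number of warm-up iterations read from the launch intent goes from 5 to 10. For reference, a hypothetical way to override both values when launching the benchmark activity over adb; the component name is inferred from the source path, and any other extras the activity may require (such as the model to load) are not shown:

```
# Hypothetical invocation: component name and extra set are assumptions.
adb shell am start -W -n org.pytorch.minibench/.BenchmarkActivity \
  --ei num_iter 50 \
  --ei num_warm_up_iter 10
```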
