
Commit f29c7f8

Update base for Update on "[executorch][weight sharing] Introduce NamedData to PTE schema"
See 'Schema Changes' in the [RFC](

Differential Revision: [D69430152](https://our.internmc.facebook.com/intern/diff/D69430152/)

[ghstack-poisoned]
2 parents a777e49 + 9c12c8f commit f29c7f8

20 files changed: +135 -64 lines changed

.github/workflows/_android.yml

Lines changed: 1 addition & 1 deletion
@@ -29,7 +29,7 @@ jobs:
 export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
 
 # Build LLM Demo for Android
-bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
+bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME}
 
 # Running Android emulator directly on the runner and not using Docker
 run-emulator:
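
This commit renames the build script from build/build_android_llm_demo.sh to build/build_android_library.sh everywhere it is referenced. A minimal local sketch of the same step the workflow runs, assuming the renamed script still takes the artifacts directory as its only argument:

```
# Hedged sketch: mirrors the CI step above; assumes the renamed script's
# argument handling is unchanged by the rename.
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
bash build/build_android_library.sh "${ARTIFACTS_DIR_NAME}"
```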

.github/workflows/android-perf.yml

Lines changed: 1 addition & 1 deletion
@@ -363,7 +363,7 @@ jobs:
 PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
 
 export ANDROID_ABIS="arm64-v8a"
-PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
+PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME}
 
 # Let's see how expensive this job is, we might want to tone it down by running it periodically
 benchmark-on-device:

.github/workflows/android-release-artifacts.yml

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ jobs:
 export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
 
 # Build LLM Demo for Android
-bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
+bash build/build_android_library.sh ${ARTIFACTS_DIR_NAME}
 
 shasum -a 256 "${ARTIFACTS_DIR_NAME}/llm_demo/executorch.aar"

backends/xnnpack/test/TARGETS

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ runtime.python_test(
     srcs = glob([
         "models/*.py",
     ]),
-    tags = ["long_running"],
+    labels = ["long_running"],
     deps = [
         "fbsource//third-party/pypi/timm:timm",
         "fbsource//third-party/pypi/torchsr:torchsr",  # @manual

File renamed without changes.

docs/source/backends-xnnpack.md

Lines changed: 0 additions & 1 deletion
@@ -121,4 +121,3 @@ target_link_libraries(
 ```
 
 No additional steps are necessary to use the backend beyond linking the target. Any XNNPACK-delegated .pte file will automatically run on the registered backend.
-
examples/demo-apps/android/LlamaDemo/docs/delegates/mediatek_README.md

Lines changed: 1 addition & 1 deletion
@@ -123,7 +123,7 @@ The Mediatek runner (`examples/mediatek/executor_runner/mtk_llama_runner.cpp`) c
 
 Next we need to build and compile the MediaTek backend and MediaTek Llama runner. By setting `NEURON_BUFFER_ALLOCATOR_LIB`, the script will build the MediaTek backend.
 ```
-sh build/build_android_llm_demo.sh
+sh build/build_android_library.sh
 ```
 
 **Output**: This will generate an .aar file that is already imported into the expected directory for the Android app. It will live in `examples/demo-apps/android/Llamademo/app/libs`.
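
The README excerpt above says that setting `NEURON_BUFFER_ALLOCATOR_LIB` makes the script build the MediaTek backend, but the quoted lines do not show the variable being set. A minimal sketch, assuming the variable simply points at the MediaTek neuron buffer allocator shared library (the path and file name are placeholders, not taken from this diff):

```
# Hedged sketch: the library location is illustrative only.
export NEURON_BUFFER_ALLOCATOR_LIB=<path_to>/libneuron_buffer_allocator.so
sh build/build_android_library.sh
```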

examples/demo-apps/android/LlamaDemo/setup-with-qnn.sh

Lines changed: 1 addition & 1 deletion
@@ -13,7 +13,7 @@ if [ -z "$QNN_SDK_ROOT" ]; then
 fi
 
 BASEDIR=$(dirname "$0")
-source "$BASEDIR"/../../../../build/build_android_llm_demo.sh
+source "$BASEDIR"/../../../../build/build_android_library.sh
 
 BUILD_AAR_DIR="$(mktemp -d)"
 export BUILD_AAR_DIR

examples/demo-apps/android/LlamaDemo/setup.sh

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@
 set -eu
 
 BASEDIR=$(dirname "$0")
-source "$BASEDIR"/../../../../build/build_android_llm_demo.sh
+source "$BASEDIR"/../../../../build/build_android_library.sh
 
 BUILD_AAR_DIR="$(mktemp -d)"
 export BUILD_AAR_DIR

examples/models/llama/export_llama_lib.py

Lines changed: 91 additions & 41 deletions
@@ -676,47 +676,62 @@ def _validate_args(args):
         )
 
 
-def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
-    _validate_args(args)
-
-    pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
-
-    # export_to_edge
-    builder_exported = _prepare_for_llama_export(args).export()
-
-    builder_exported.run_canonical_optimizations()
-
-    if args.export_only:
-        exit()
-
-    builder_exported_to_edge = builder_exported.pt2e_quantize(
-        quantizers
-    ).export_to_edge()
-
-    modelname = builder_exported_to_edge.modelname
-
-    # to_backend
+def _to_edge_and_lower_llama_xnnpack(
+    builder_exported,
+    modelname,
+    additional_passes,
+    pt2e_quant_params,
+    quantizers,
+    quant_dtype,
+    args,
+) -> LLMEdgeManager:  # noqa: C901
     partitioners = []
 
     # Order matters here, dynamic quantization should be applied first when both xnnpack and xnnpack_extended_ops are enabled
-    if (
-        pt2e_quant_params is not None and pt2e_quant_params.quantize_linear is not None
-    ) or (args.xnnpack):
-        partitioners.append(
-            get_xnnpack_partitioner(dynamic_quant_only_partitioner=True)
-        )
+    partitioners.append(get_xnnpack_partitioner(dynamic_quant_only_partitioner=True))
 
-        # force xnnpack to be true if pt2e_quant_params is not None and args.xnnpack is False
-        args.xnnpack = True
-        modelname = f"xnnpack_dq_{modelname}"
+    modelname = f"xnnpack_dq_{modelname}"
 
     if args.xnnpack_extended_ops:
-        assert args.xnnpack, "xnnpack_extended_ops requires xnnpack to be enabled"
         partitioners.append(
             get_xnnpack_partitioner(dynamic_quant_only_partitioner=False)
        )
         modelname = f"xnnpack_{modelname}"
 
+    logging.info("Lowering model using following partitioner(s): ")
+    for partitioner in partitioners:
+        logging.info(f"--> {partitioner.__class__.__name__}")
+
+    # TODO: Enable generating ETRecord with XNNPack and to_edge_transform_and_lower().
+    if args.generate_etrecord:
+        raise NotImplementedError(
+            "export_llama does not support XNNPack and generating ETRecord at the moment."
+        )
+
+    builder = builder_exported.pt2e_quantize(quantizers).to_edge_transform_and_lower(
+        partitioners
+    )
+    if args.verbose:
+        print_delegation_info(builder.edge_manager.exported_program().graph_module)
+
+    return builder.to_executorch(passes=additional_passes)
+
+
+def _to_edge_and_lower_llama(  # noqa: C901
+    builder_exported,
+    modelname,
+    additional_passes,
+    pt2e_quant_params,
+    quantizers,
+    quant_dtype,
+    args,
+):
+    builder_exported_to_edge = builder_exported.pt2e_quantize(
+        quantizers
+    ).export_to_edge()
+
+    # to_backend
+    partitioners = []
     if args.vulkan:
         partitioners.append(
             get_vulkan_partitioner(
@@ -731,7 +746,6 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
         modelname = f"vulkan_{modelname}"
 
         # Need to remove asserts from the graph to prevent graph breaks
-        # pyre-ignore: Undefined attribute [16]: `Optional` has no attribute `exported_program`.
         remove_asserts(builder_exported_to_edge.edge_manager.exported_program())
 
     if args.mps:
@@ -760,13 +774,11 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
         # pyre-ignore: Undefined import [21]: Could not find a module corresponding to import `executorch.backends.qualcomm.utils.utils`
         from executorch.backends.qualcomm.utils.utils import _transform, tag_quant_io
 
-        # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`, Optional type has no attribute `exported_program`
         _transform(builder_exported_to_edge.edge_manager.exported_program())
 
         if args.num_sharding > 0:
             model_sharding.split_graph(
                 builder_exported_to_edge.edge_manager.exported_program(),
-                # pyre-fixme[16]: `Optional` has no attribute `__getitem__`.
                 builder_exported_to_edge.metadata["get_n_layers"],
                 shares=args.num_sharding,
             )
@@ -792,19 +804,15 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
                     atten.head_dim,
                 )
             )
-            # pyre-ignore
             tag_quant_io(
                 builder_exported_to_edge.edge_manager.exported_program().graph_module,
-                partial(get_custom_quant_ios_dtype, cache_shape),  # pyre-ignore
+                partial(get_custom_quant_ios_dtype, cache_shape),
             )
 
     logging.info("Lowering model using following partitioner(s): ")
     for partitioner in partitioners:
         logging.info(f"--> {partitioner.__class__.__name__}")
 
-    additional_passes = []
-    if args.model in TORCHTUNE_DEFINED_MODELS:
-        additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])]
     if args.generate_etrecord:
         if not builder_exported_to_edge.edge_manager:
             raise ValueError("Unable to generate etrecord due to missing edge manager.")
@@ -818,7 +826,6 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
         if args.num_sharding > 0 and args.qnn:
             from executorch.backends.qualcomm.utils.utils import canonicalize_program
 
-            # pyre-fixme[16]: Module `backends` has no attribute `qualcomm`.
             canonicalize_program(builder.edge_manager.exported_program())
 
         builder = builder.to_executorch(
@@ -840,11 +847,55 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
         if args.num_sharding > 0 and args.qnn:
             from executorch.backends.qualcomm.utils.utils import canonicalize_program
 
-            # pyre-fixme[16]: Module `backends` has no attribute `qualcomm`.
             canonicalize_program(builder.edge_manager.exported_program())
 
         builder = builder.to_executorch(passes=additional_passes)
 
+    return builder
+
+
+def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
+    _validate_args(args)
+
+    pt2e_quant_params, quantizers, quant_dtype = get_quantizer_and_quant_params(args)
+
+    additional_passes = []
+    if args.model in TORCHTUNE_DEFINED_MODELS:
+        additional_passes = [InitializedMutableBufferPass(["kv_cache_pos"])]
+
+    # export_to_edge
+    builder_exported = _prepare_for_llama_export(args).export()
+    builder_exported.run_canonical_optimizations()
+    modelname = builder_exported.modelname
+
+    if args.export_only:
+        exit()
+
+    if pt2e_quant_params is not None and pt2e_quant_params.quantize_linear is not None:
+        # Force xnnpack to be true if pt2e_quant_params is not None and args.xnnpack is False
+        args.xnnpack = True
+
+    if args.xnnpack:
+        builder = _to_edge_and_lower_llama_xnnpack(
+            builder_exported,
+            modelname,
+            additional_passes,
+            pt2e_quant_params,
+            quantizers,
+            quant_dtype,
+            args,
+        )
+    else:
+        builder = _to_edge_and_lower_llama(
+            builder_exported,
+            modelname,
+            additional_passes,
+            pt2e_quant_params,
+            quantizers,
+            quant_dtype,
+            args,
+        )
+
     if args.profile_memory:
         generate_memory_trace(builder.export_program, "memory_profile.json")
 
@@ -866,7 +917,6 @@ def _export_llama(args) -> LLMEdgeManager: # noqa: C901
     output_file = f"{builder.output_dir}/{modelname}.pte"
 
     builder.save_to_pte(output_file)
-
     return builder
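
In short, `_export_llama` now validates the args, exports the model, and then dispatches: the XNNPACK path goes through `to_edge_transform_and_lower()` in `_to_edge_and_lower_llama_xnnpack`, while every other backend keeps the previous `export_to_edge()` flow in `_to_edge_and_lower_llama`. A minimal sketch of reaching the XNNPACK path from the command line; the flag spellings are assumptions inferred from the `args.xnnpack` attribute used above, and the model files are placeholders:

```
# Hedged sketch: flag names mirror the args.* attributes in the diff and may
# not match the real CLI exactly; checkpoint/params paths are placeholders.
python -m examples.models.llama.export_llama \
  --checkpoint <path_to_checkpoint.pth> \
  --params <path_to_params.json> \
  --xnnpack
```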

examples/models/llama/source_transformation/quantize.py

Lines changed: 2 additions & 2 deletions
@@ -14,8 +14,6 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from executorch.backends.vulkan._passes import VkInt4WeightOnlyQuantizer
-
 from executorch.extension.llm.export.builder import DType
 
 from sentencepiece import SentencePieceProcessor
@@ -180,6 +178,8 @@ def quantize(  # noqa C901
         model = gptq_quantizer.quantize(model, inputs)
         return model
     elif qmode == "vulkan_4w":
+        from executorch.backends.vulkan._passes import VkInt4WeightOnlyQuantizer
+
         q_group_size = 256 if group_size is None else group_size
         model = VkInt4WeightOnlyQuantizer(groupsize=q_group_size).quantize(model)
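
This moves the `VkInt4WeightOnlyQuantizer` import inside the `vulkan_4w` branch, so the Vulkan backend only has to be importable when that quantization mode is actually requested. A hedged sketch of requesting that mode through the llama export flow; the flag spellings are assumptions, and only `qmode == "vulkan_4w"` and the group size default of 256 are grounded in the diff above:

```
# Hedged sketch: flag names are assumed, not confirmed by this diff.
python -m examples.models.llama.export_llama \
  --checkpoint <path_to_checkpoint.pth> \
  --params <path_to_params.json> \
  -qmode vulkan_4w \
  --group_size 256
```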

examples/models/llava/export_llava.py

Lines changed: 0 additions & 1 deletion
@@ -67,7 +67,6 @@ def export(self) -> "LlavaEdgeManager":
             dynamic_shapes=dynamic_shape,
             strict=False,
         )
-        # pyre-ignore: Incompatible attribute type [8]: Attribute `pre_autograd_graph_module` declared in class `LLMEdgeManager` has type `Optional[GraphModule]` but is used as type `Module`.
         self.pre_autograd_graph_module = self.export_program.module()
         return self

exir/dialects/edge/test/TARGETS

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ python_unittest(
     resources = {
         "//executorch/exir/dialects/edge:edge_yaml": "edge.yaml",
     },
-    tags = ["long_running"],
+    labels = ["long_running"],
     deps = [
         "fbsource//third-party/pypi/expecttest:expecttest",  # @manual
         "//caffe2:torch",

extension/android_test/setup.sh

Lines changed: 1 addition & 1 deletion
@@ -11,7 +11,7 @@ BUILD_AAR_DIR="$(mktemp -d)"
 export BUILD_AAR_DIR
 
 BASEDIR=$(dirname "$0")
-source "$BASEDIR"/../../build/build_android_llm_demo.sh
+source "$BASEDIR"/../../build/build_android_library.sh
 
 build_native_library() {
   ANDROID_ABI="$1"

extension/benchmark/android/benchmark/README.md

Lines changed: 1 addition & 1 deletion
@@ -15,7 +15,7 @@ Minibench is usedful for giving reference performance data when developers integ
 You will need executorch AAR for Java and JNI dependencies.
 ```
 export ANDROID_NDK=<path_to_android_ndk>
-sh build/build_android_llm_demo.sh
+sh build/build_android_library.sh
 ```
 and copy the AAR to `app/libs`.
 ```

extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ protected void onCreate(Bundle savedInstanceState) {
             .get();
 
     int numIter = intent.getIntExtra("num_iter", 50);
-    int numWarmupIter = intent.getIntExtra("num_warm_up_iter", 5);
+    int numWarmupIter = intent.getIntExtra("num_warm_up_iter", 10);
 
     // TODO: Format the string with a parsable format
     Stats stats = new Stats();
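
The default number of warm-up iterations read from the launch intent goes from 5 to 10. For reference, a hypothetical way to override both values when launching the benchmark activity over adb; the component name is inferred from the source path, and any other extras the activity may require (such as the model to load) are not shown:

```
# Hypothetical invocation: component name and extra set are assumptions.
adb shell am start -W -n org.pytorch.minibench/.BenchmarkActivity \
  --ei num_iter 50 \
  --ei num_warm_up_iter 10
```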
