
Commit d40749c

Update base for Update on "[ET-VK] Clean up shader library and introduce some new conventions"
## Context

This changeset introduces some fairly mechanical improvements to the Vulkan compute graph shader library in order to introduce some new conventions. **Note that backwards compatibility with existing shader authoring methods is preserved.**

### Only list `VALUE` in the `.yaml` files

Previously, to generate variants for a combination of values, the YAML file would contain

```
PACKING:
  - VALUE: CHANNELS_PACKED
    SUFFIX: C_packed
  - VALUE: WIDTH_PACKED
    SUFFIX: W_packed
  - VALUE: HEIGHT_PACKED
    SUFFIX: H_packed
```

However, the shader code generation script will use the `VALUE` as the `SUFFIX` if no `SUFFIX` is provided. Therefore, only the below is needed:

```
PACKING:
  - VALUE: C_packed
  - VALUE: W_packed
  - VALUE: H_packed
```

### Change indexing utility macros to lowercase

Indexing utility macros have been changed to lowercase, and the packing identifiers have been changed due to the change in the YAML files. The change to lowercase makes calls to the macros read more like functions (and indeed they are typically used as functions), which helps make the code more readable.

```
POS_TO_COORD_${PACKING} -> pos_to_coord_${PACKING}
```

### Use a convention of defining macros to reduce Python code block usage

Previously, Python code blocks were used in the GLSL code itself to vary the shader between different settings. However, Python code blocks negatively impact code readability. Therefore, this diff introduces a convention of defining macros near the top of the shader to reduce the usage of Python code blocks, i.e.

```
#define pos_to_coord pos_to_coord_${PACKING}
#define get_packed_dim get_packed_dim_${PACKING}
#define get_packed_stride get_packed_stride_${PACKING}
```

### Improve GLSL type definitions

Previously, the following Python code blocks were used to determine the appropriate vectorized and scalar types:

```
${VEC4_T[DTYPE]} texel = ...
${T[DTYPE]} scalar = ...
```

This changeset replaces that with:

```
#define BUF_T ${buffer_scalar_type(DTYPE)}
#define VEC4_T ${texel_type(DTYPE)}
#define SCALAR_T ${texel_component_type(DTYPE)}

layout(set = 0, binding = 1) buffer PRECISION restrict readonly Buffer {
  BUF_T data[];
}
buffer_in;

VEC4_T texel = ...
SCALAR_T scalar = ...
```

The main differences are as follows:

* `buffer_scalar_type()` produces the same result as `T[DTYPE]`.
* `texel_type()` is not determined from a mapping with `DTYPE`, but is determined indirectly based on the image format that is associated with the `DTYPE`.
* `texel_component_type()` is based on the result of `texel_type(DTYPE)`.

Essentially, the mapping is more in line with what happens in code. The reason for this change is to enable FP16 support, and it is a bit complicated. Basically, we need a way to distinguish the scalar type used for buffer storage from the scalar type used to store a component of a vec4 type (hence `BUF_T` vs `SCALAR_T`). This is required because, to support half-precision tensors, the buffer representation will use a 16-bit float type, but textures will still extract to `vec4` (i.e. 4x 32-bit floats).

Differential Revision: [D56082461](https://our.internmc.facebook.com/intern/diff/D56082461/)

[ghstack-poisoned]
2 parents 74eb8b3 + 21fdc4e commit d40749c
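
To make the new conventions concrete, here is a small Python sketch of how the codegen-side helpers named in the commit message could behave. The function names come from the message itself; the dtype strings, image formats, and mapping values are illustrative assumptions, not the actual ExecuTorch codegen implementation.

```python
def variant_suffix(entry: dict) -> str:
    # The codegen script is described as falling back to VALUE when no SUFFIX
    # is given, which is why the shortened YAML form is sufficient.
    return entry.get("SUFFIX", entry["VALUE"])


def buffer_scalar_type(dtype: str) -> str:
    # Scalar type used for buffer storage; half-precision tensors use a
    # 16-bit float type in the buffer representation (assumed names).
    return {"float": "float", "half": "float16_t", "int": "int"}[dtype]


def texel_type(dtype: str) -> str:
    # Determined indirectly from the image format associated with the dtype;
    # half-precision textures still extract to a 32-bit vec4.
    image_format = {"float": "rgba32f", "half": "rgba16f", "int": "rgba32i"}[dtype]
    return {"rgba32f": "vec4", "rgba16f": "vec4", "rgba32i": "ivec4"}[image_format]


def texel_component_type(dtype: str) -> str:
    # Derived from the result of texel_type() rather than from dtype directly.
    return {"vec4": "float", "ivec4": "int", "uvec4": "uint"}[texel_type(dtype)]


if __name__ == "__main__":
    print(variant_suffix({"VALUE": "C_packed"}))           # C_packed
    print(buffer_scalar_type("half"), texel_type("half"))  # float16_t vec4
```

Under these assumptions, a template line such as `#define VEC4_T ${texel_type(DTYPE)}` would expand to `vec4` even for a half-precision tensor, while `#define BUF_T ${buffer_scalar_type(DTYPE)}` would expand to a 16-bit float type, which is exactly the buffer-versus-texel distinction the commit message describes.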

File tree

14 files changed: +257, -61 lines

.github/workflows/android.yml
Lines changed: 60 additions & 0 deletions

@@ -33,6 +33,7 @@ jobs:
      submodules: 'true'
      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
      timeout: 90
+      upload-artifact: android-apps
      script: |
        set -eux

@@ -45,3 +46,62 @@ jobs:
        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
        # Build Android demo app
        bash build/test_android_ci.sh
+
+        mkdir -p artifacts-to-be-uploaded
+        # Copy the app and its test suite to S3
+        cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/debug/*.apk artifacts-to-be-uploaded/
+        cp examples/demo-apps/android/LlamaDemo/app/build/outputs/apk/androidTest/debug/*.apk artifacts-to-be-uploaded/
+        # Also copy the shared libraries
+        cp cmake-out-android/lib/*.a artifacts-to-be-uploaded/
+
+  # Upload the app and its test suite to S3 so that they can be downloaded by the test job
+  upload-artifacts:
+    needs: test-demo-android
+    runs-on: linux.2xlarge
+    steps:
+      - name: Download the artifacts
+        uses: actions/download-artifact@v3
+        with:
+          # The name here needs to match the name of the upload-artifact parameter
+          name: android-apps
+          path: ${{ runner.temp }}/artifacts/
+
+      - name: Verify the artifacts
+        shell: bash
+        working-directory: ${{ runner.temp }}/artifacts/
+        run: |
+          ls -lah ./
+
+      - name: Upload the artifacts to S3
+        uses: seemethere/upload-artifact-s3@v5
+        with:
+          s3-bucket: gha-artifacts
+          s3-prefix: |
+            ${{ github.repository }}/${{ github.run_id }}/artifact
+          retention-days: 14
+          if-no-files-found: ignore
+          path: ${{ runner.temp }}/artifacts/
+
+  # Let's see how expensive this job is, we might want to tone it down by running it periodically
+  test-llama-app:
+    needs: upload-artifacts
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
+    with:
+      device-type: android
+      runner: ubuntu-latest
+      test-infra-ref: ''
+      # This is the ARN of the ExecuTorch project on AWS
+      project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6
+      # This is the custom Android device pool that only includes Samsung Galaxy S2x
+      device-pool-arn: arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa
+      # Uploaded to S3 from the previous job, the name of the app comes from the project itself
+      android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/app-debug.apk
+      android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/app-debug-androidTest.apk
+      # The test spec can be downloaded from https://ossci-assets.s3.amazonaws.com/android-llama2-device-farm-test-spec.yml
+      test-spec: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/414cb54d-4d83-4576-8317-93244e4dc50e
+      # The exported llama2 model and its tokenizer can be downloaded from https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b.zip.
+      # Among the inputs, this is the biggest file and uploading it to AWS beforehand makes the test run much faster
+      extra-data: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/bd15825b-ddab-4e47-9fef-a9c8935778dd

build/test_android_ci.sh
Lines changed: 1 addition & 0 deletions

@@ -32,6 +32,7 @@ build_android_llama_demo_app() {
  pushd examples/demo-apps/android/LlamaDemo
  ANDROID_NDK=/opt/ndk ANDROID_ABI=arm64-v8a ./gradlew setup
  ANDROID_HOME=/opt/android/sdk ./gradlew build
+  ANDROID_HOME=/opt/android/sdk ./gradlew assembleAndroidTest
  popd
}

docs/source/llm/getting-started.md
Lines changed: 119 additions & 23 deletions

@@ -374,46 +374,102 @@ specific hardware (delegation), and because it is doing all of the calculations

## Delegation

-While ExecuTorch provides a portable, cross-platform implementation for all operators, it also provides specialized
-backends for a number of different targets. These include, but are not limited to, x86 and ARM CPU acceleration via
-the XNNPACK backend, Apple acceleration via the CoreML backend and Metal Performance Shader (MPS) backend, and GPU
-acceleration via the Vulkan backend.
-
-Because optimizations are specific to a given backend, each pte file is specific to the backend(s) targeted at
-export. To support multiple devices, such as XNNPACK acceleration for Android and CoreML for iOS, export a separate
-PTE file for each backend.
-
-To delegate to a backend at export time, ExecuTorch provides the `to_backend()` function, which takes a backend-
-specific partitioner object. The partitioner is responsible for finding parts of the computation graph that can
-be accelerated by the target backend. Any portions of the computation graph not delegated will be executed by the
-portable or optimized ExecuTorch implementations.
-
-To delegate to the XNNPACK backend, call `to_backend` with an instance of `XnnpackPartitioner()`.
+While ExecuTorch provides a portable, cross-platform implementation for all
+operators, it also provides specialized backends for a number of different
+targets. These include, but are not limited to, x86 and ARM CPU acceleration via
+the XNNPACK backend, Apple acceleration via the CoreML backend and Metal
+Performance Shader (MPS) backend, and GPU acceleration via the Vulkan backend.
+
+Because optimizations are specific to a given backend, each pte file is specific
+to the backend(s) targeted at export. To support multiple devices, such as
+XNNPACK acceleration for Android and CoreML for iOS, export a separate PTE file
+for each backend.
+
+To delegate to a backend at export time, ExecuTorch provides the `to_backend()`
+function on the `EdgeProgramManager` object, which takes a backend-specific
+partitioner object. The partitioner is responsible for finding parts of the
+computation graph that can be accelerated by the target backend, and the
+`to_backend()` function will delegate the matched parts to the given backend for
+acceleration and optimization. Any portions of the computation graph not
+delegated will be executed by the ExecuTorch operator implementations.
+
+To delegate the exported model to a specific backend, we first need to import
+its partitioner as well as the edge compile config from the ExecuTorch codebase,
+then call `to_backend` with an instance of the partitioner on the
+`EdgeProgramManager` object that the `to_edge` function created.
+
+Here's an example of how to delegate NanoGPT to XNNPACK (if you're deploying to an Android phone, for instance):

```python
# export_nanogpt.py

+# Load the partitioner for the Xnnpack backend
from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
+
+# A model to be delegated to a specific backend should use that backend's edge compile config
from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config
+from executorch.exir import EdgeCompileConfig, to_edge
+
+import torch
+from torch.export import export
+from torch.nn.attention import sdpa_kernel, SDPBackend
+from torch._export import capture_pre_autograd_graph
+
+from model import GPT
+
+# Load the NanoGPT model.
+model = GPT.from_pretrained('gpt2')

-#...
+# Create example inputs. This is used in the export process to provide
+# hints on the expected shape of the model input.
+example_inputs = (
+    torch.randint(0, 100, (1, 8), dtype=torch.long),
+)
+
+# Trace the model, converting it to a portable intermediate representation.
+# The torch.no_grad() call tells PyTorch to exclude training-specific logic.
+with torch.nn.attention.sdpa_kernel([SDPBackend.MATH]), torch.no_grad():
+    m = capture_pre_autograd_graph(model, example_inputs)
+    traced_model = export(m, example_inputs)

+# Convert the model into a runnable ExecuTorch program.
+# To be further lowered to the Xnnpack backend, `traced_model` needs the xnnpack-specific edge compile config.
edge_config = get_xnnpack_edge_compile_config()
edge_manager = to_edge(traced_model, compile_config=edge_config)

-# Delegate to the XNNPACK backend.
+# Delegate the exported model to the Xnnpack backend by invoking `to_backend` with the Xnnpack partitioner.
edge_manager = edge_manager.to_backend(XnnpackPartitioner())
-
et_program = edge_manager.to_executorch()

+# Save the Xnnpack-delegated ExecuTorch program to a file.
+with open("nanogpt.pte", "wb") as file:
+    file.write(et_program.buffer)
+
+
```

-Additionally, update CMakeLists.txt to build and link the XNNPACK backend.
+Additionally, update CMakeLists.txt to build and link the XNNPACK backend to
+the ExecuTorch runner.

```
-option(EXECUTORCH_BUILD_XNNPACK "" ON)
+cmake_minimum_required(VERSION 3.19)
+project(nanogpt_runner)

-# ...
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED True)
+
+# Set options for the executorch build.
+option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "" ON)
+option(EXECUTORCH_BUILD_EXTENSION_MODULE "" ON)
+option(EXECUTORCH_BUILD_OPTIMIZED "" ON)
+option(EXECUTORCH_BUILD_XNNPACK "" ON) # Build with the Xnnpack backend
+
+# Include the executorch subdirectory.
+add_subdirectory(
+    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/executorch
+    ${CMAKE_BINARY_DIR}/executorch)
+
+# include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src)

add_executable(nanogpt_runner main.cpp)
target_link_libraries(
@@ -423,11 +479,51 @@ target_link_libraries(
    extension_module_static # Provides the Module class
    optimized_native_cpu_ops_lib # Provides baseline cross-platform kernels
    xnnpack_backend) # Provides the XNNPACK CPU acceleration backend
+```
+
+Keep the rest of the code the same. Refer to
+[Exporting to ExecuTorch](https://pytorch.org/executorch/main/llm/getting-started.html#step-1-exporting-to-executorch)
+and
+[Invoking the Runtime](https://pytorch.org/executorch/main/llm/getting-started.html#step-2-invoking-the-runtime)
+for more details.

+At this point, the working directory should contain the following files:
+
+- CMakeLists.txt
+- main.cpp
+- basic_tokenizer.h
+- basic_sampler.h
+- managed_tensor.h
+- export_nanogpt.py
+- model.py
+- vocab.json
+
+If all of these are present, you can now export the Xnnpack-delegated pte model:
+```bash
+python export_nanogpt.py
```

-For more information, see the ExecuTorch guides for the [XNNPACK Backend](https://pytorch.org/executorch/stable/tutorial-xnnpack-delegate-lowering.html)
-and [CoreML Backend](https://pytorch.org/executorch/stable/build-run-coreml.html).
+This will generate `nanogpt.pte` in the same working directory.
+
+Then we can build and run the model with:
+```bash
+(rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake ..)
+cmake --build cmake-out -j10
+./cmake-out/nanogpt_runner
+```
+
+You should see something like the following:
+
+```
+Once upon a time, there was a man who was a member of the military...
+```
+
+
+For more information regarding backend delegation, see the ExecuTorch guides
+for the
+[XNNPACK Backend](https://pytorch.org/executorch/stable/tutorial-xnnpack-delegate-lowering.html)
+and
+[CoreML Backend](https://pytorch.org/executorch/stable/build-run-coreml.html).

## Quantization

examples/demo-apps/android/LlamaDemo/app/src/androidTest/java/com/example/executorchllamademo/PerfTest.java
Lines changed: 1 addition & 1 deletion

@@ -28,7 +28,7 @@ public class PerfTest implements LlamaCallback {
  private static final String TOKENIZER_BIN = "tokenizer.bin";

  // From https://github.com/pytorch/executorch/blob/main/examples/models/llama2/README.md
-  private static final Float EXPECTED_TPS = 7.0F;
+  private static final Float EXPECTED_TPS = 10.0F;

  private final List<String> results = new ArrayList<>();
  private final List<Float> tokensPerSecond = new ArrayList<>();

examples/models/llama2/README.md
Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ Please note that the models are subject to the [acceptable use policy](https://g
Since 7B Llama2 model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized model.

## Quantization:
-We employed 4-bit groupwise per token dynamic quantization of all the linear layers of the model. Dynamic quantization refers to quantizating activations dynamically, such that quantization parameters for activations are calculated, from min/max range, at runtime. Here we quantized activations with 8bits (signed integer). Furthermore, weights are statically quantized. In our case weights were per-channel groupwise quantized with 4bit signed integer. For more information refer to this [page](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html).
+We employed 4-bit groupwise per token dynamic quantization of all the linear layers of the model. Dynamic quantization refers to quantizating activations dynamically, such that quantization parameters for activations are calculated, from min/max range, at runtime. Here we quantized activations with 8bits (signed integer). Furthermore, weights are statically quantized. In our case weights were per-channel groupwise quantized with 4bit signed integer. For more information refer to this [page](https://github.com/pytorch-labs/ao/).

We evaluated WikiText perplexity using [LM Eval](https://github.com/EleutherAI/lm-evaluation-harness). Below are the results for two different groupsizes.
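
As background for the scheme the README describes (per-token dynamic quantization of activations, with scale and zero-point computed from the runtime min/max and values mapped to signed 8-bit integers), here is a minimal PyTorch sketch of that arithmetic. It is illustrative only; it is not the quantization kernel used by ExecuTorch or torchao, and the helper name is hypothetical.

```python
import torch

def quantize_activations_per_token(x: torch.Tensor):
    # Dynamic quantization: scale and zero-point are derived from each token's
    # min/max at runtime, then values are mapped to signed int8.
    qmin, qmax = -128, 127
    x_min = x.min(dim=-1, keepdim=True).values.clamp(max=0.0)
    x_max = x.max(dim=-1, keepdim=True).values.clamp(min=0.0)
    scale = ((x_max - x_min) / (qmax - qmin)).clamp(min=1e-8)
    zero_point = (qmin - x_min / scale).round()
    q = (x / scale + zero_point).round().clamp(qmin, qmax).to(torch.int8)
    return q, scale, zero_point

# Example: a batch of two tokens with eight activation values each.
activations = torch.randn(2, 8)
q, scale, zp = quantize_activations_per_token(activations)
dequant = (q.float() - zp) * scale  # approximate reconstruction of the input
```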

examples/models/llama2/runner/runner.cpp
Lines changed: 5 additions & 4 deletions

@@ -10,6 +10,7 @@
// The module takes in a string as input and emits a string as output.

#include <executorch/examples/models/llama2/runner/runner.h>
+#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>
#include <executorch/extension/evalue_util/print_evalue.h>
#include <executorch/extension/runner_util/managed_tensor.h>

@@ -76,7 +77,7 @@ Error Runner::load() {
  append_eos_ = getMetadataHelper("append_eos_to_prompt", false);

  // Load tokenizer
-  tokenizer_ = std::make_unique<Tokenizer>(vocab_size_, bos_id_, eos_id_);
+  tokenizer_ = std::make_unique<BPETokenizer>(vocab_size_, bos_id_, eos_id_);
  tokenizer_->load(tokenizer_path_);
  if (tokenizer_->bos_tok() != bos_id_) {
    ET_LOG(

@@ -105,7 +106,7 @@ Error Runner::load() {
}

template <typename T>
-T Runner::getMetadataHelper(std::string method_name, T default_val) {
+T Runner::getMetadataHelper(const std::string& method_name, T default_val) {
  T res = default_val;
  if (model_methods_.count(method_name)) {
    Result<std::vector<EValue>> outputs = module_->execute(method_name);

@@ -484,9 +485,9 @@ void Runner::stop() {

// explicit instantiation of template methods
template int64_t Runner::getMetadataHelper<int64_t>(
-    std::string method_name,
+    const std::string& method_name,
    int64_t default_val);
template bool Runner::getMetadataHelper<bool>(
-    std::string method_name,
+    const std::string& method_name,
    bool default_val);
} // namespace torch::executor

examples/models/llama2/runner/runner.h
Lines changed: 1 addition & 1 deletion

@@ -69,7 +69,7 @@ class Runner {
 private:
  // metadata
  template <typename T>
-  T getMetadataHelper(std::string method_name, T default_val);
+  T getMetadataHelper(const std::string& method_name, T default_val);
  template <typename T>
  int32_t
  logitsToToken(const exec_aten::Tensor& logits_tensor, int64_t pos, T _);

examples/models/llama2/tokenizer/tokenizer.cpp renamed to examples/models/llama2/tokenizer/bpe_tokenizer.cpp
Lines changed: 10 additions & 10 deletions

@@ -6,7 +6,7 @@
 * LICENSE file in the root directory of this source tree.
 */

-#include <executorch/examples/models/llama2/tokenizer/tokenizer.h>
+#include <executorch/examples/models/llama2/tokenizer/bpe_tokenizer.h>

#include <string>

@@ -23,11 +23,11 @@ static int compare_tokens(const void* a, const void* b) {
  return strcmp(((TokenIndex*)a)->str, ((TokenIndex*)b)->str);
}

-Tokenizer::Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
-    : initialized_(false),
-      vocab_size_(vocab_size),
-      bos_tok_(bos_tok),
-      eos_tok_(eos_tok),
+BPETokenizer::BPETokenizer(
+    int32_t vocab_size,
+    uint64_t bos_tok,
+    uint64_t eos_tok)
+    : Tokenizer(vocab_size, bos_tok, eos_tok),
      vocab_(std::make_unique<char*[]>(vocab_size)),
      vocab_scores_(std::make_unique<float[]>(vocab_size)),
      sorted_vocab_(std::make_unique<TokenIndex[]>(vocab_size)) {

@@ -47,7 +47,7 @@ Tokenizer::Tokenizer(int32_t vocab_size, uint64_t bos_tok, uint64_t eos_tok)
 * @param tokenizer_path The path to the tokenizer file.
 * @return Error
 */
-Error Tokenizer::load(const std::string& tokenizer_path) {
+Error BPETokenizer::load(const std::string& tokenizer_path) {
  if (initialized_) {
    ET_LOG(Info, "Tokenizer already initialized");
    return Error::Ok;

@@ -131,7 +131,7 @@ Error Tokenizer::load(const std::string& tokenizer_path) {
  return Error::Ok;
}

-Tokenizer::~Tokenizer() {
+BPETokenizer::~BPETokenizer() {
  for (int i = 0; i < vocab_size_; i++) {
    delete[] vocab_[i];
  }

@@ -145,7 +145,7 @@ Tokenizer::~Tokenizer() {
 * @return Result<std::string> A pointer to the string representation of the
 * token.
 */
-Result<std::string> Tokenizer::decode(uint64_t prev_token, uint64_t token) {
+Result<std::string> BPETokenizer::decode(uint64_t prev_token, uint64_t token) {
  if (!initialized_) {
    ET_LOG(Error, "Tokenizer not initialized");
    return Error::NotSupported;

@@ -187,7 +187,7 @@ str_lookup(const char* str, TokenIndex* sorted_vocab, int32_t vocab_size) {
 * @return Result<std::vector<uint64_t>>
 */
Result<std::vector<uint64_t>>
-Tokenizer::encode(const std::string& text, int8_t bos, int8_t eos) {
+BPETokenizer::encode(const std::string& text, int8_t bos, int8_t eos) {
  if (!initialized_) {
    ET_LOG(Error, "Tokenizer not initialized");
    return Error::NotSupported;
