
Commit f5ec6cf

Update on "Add a simple sdpa"
Add a simple SDPA so that attention decomposes into simpler ops, instead of relying on the decomposition of `F.scaled_dot_product_attention`, which expands into 29 ops, including `torch.where`:

```
def forward(self, q, k, v):
    aten_mul_scalar = executorch_exir_dialects_edge__ops_aten_mul_Scalar(q, 0.5946035575013605);  q = None
    aten_full_default = executorch_exir_dialects_edge__ops_aten_full_default([8, 8], True, dtype = torch.bool, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
    aten_arange_start_step = executorch_exir_dialects_edge__ops_aten_arange_start_step(0, 8, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
    aten_unsqueeze_copy_default = executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default(aten_arange_start_step, -2);  aten_arange_start_step = None
    aten_arange_start_step_1 = executorch_exir_dialects_edge__ops_aten_arange_start_step(0, 8, layout = torch.strided, device = device(type='cpu'), pin_memory = False)
    aten_unsqueeze_copy_default_1 = executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default(aten_arange_start_step_1, -1);  aten_arange_start_step_1 = None
    aten_sub_tensor = executorch_exir_dialects_edge__ops_aten_sub_Tensor(aten_unsqueeze_copy_default, aten_unsqueeze_copy_default_1);  aten_unsqueeze_copy_default = aten_unsqueeze_copy_default_1 = None
    aten_le_scalar = executorch_exir_dialects_edge__ops_aten_le_Scalar(aten_sub_tensor, 0);  aten_sub_tensor = None
    aten_logical_and_default = executorch_exir_dialects_edge__ops_aten_logical_and_default(aten_le_scalar, aten_full_default);  aten_le_scalar = aten_full_default = None
    aten_full_like_default = executorch_exir_dialects_edge__ops_aten_full_like_default(aten_logical_and_default, 0, dtype = torch.float32, pin_memory = False, memory_format = torch.preserve_format)
    aten_logical_not_default = executorch_exir_dialects_edge__ops_aten_logical_not_default(aten_logical_and_default);  aten_logical_and_default = None
    aten_scalar_tensor_default = executorch_exir_dialects_edge__ops_aten_scalar_tensor_default(-inf, dtype = torch.float32, layout = torch.strided, device = device(type='cpu'))
    aten_where_self = executorch_exir_dialects_edge__ops_aten_where_self(aten_logical_not_default, aten_scalar_tensor_default, aten_full_like_default);  aten_logical_not_default = aten_scalar_tensor_default = aten_full_like_default = None
    aten_permute_copy_default = executorch_exir_dialects_edge__ops_aten_permute_copy_default(k, [0, 1, 3, 2]);  k = None
    aten_mul_scalar_1 = executorch_exir_dialects_edge__ops_aten_mul_Scalar(aten_permute_copy_default, 0.5946035575013605);  aten_permute_copy_default = None
    aten_expand_copy_default = executorch_exir_dialects_edge__ops_aten_expand_copy_default(aten_mul_scalar, [1, 1, 8, 8]);  aten_mul_scalar = None
    aten_view_copy_default = executorch_exir_dialects_edge__ops_aten_view_copy_default(aten_expand_copy_default, [1, 8, 8]);  aten_expand_copy_default = None
    aten_expand_copy_default_1 = executorch_exir_dialects_edge__ops_aten_expand_copy_default(aten_mul_scalar_1, [1, 1, 8, 8]);  aten_mul_scalar_1 = None
    aten_view_copy_default_1 = executorch_exir_dialects_edge__ops_aten_view_copy_default(aten_expand_copy_default_1, [1, 8, 8]);  aten_expand_copy_default_1 = None
    aten_bmm_default = executorch_exir_dialects_edge__ops_aten_bmm_default(aten_view_copy_default, aten_view_copy_default_1);  aten_view_copy_default = aten_view_copy_default_1 = None
    aten_view_copy_default_2 = executorch_exir_dialects_edge__ops_aten_view_copy_default(aten_bmm_default, [1, 1, 8, 8]);  aten_bmm_default = None
    aten_add_tensor = executorch_exir_dialects_edge__ops_aten_add_Tensor(aten_view_copy_default_2, aten_where_self);  aten_view_copy_default_2 = aten_where_self = None
    aten__softmax_default = executorch_exir_dialects_edge__ops_aten__softmax_default(aten_add_tensor, -1, False);  aten_add_tensor = None
    aten_expand_copy_default_2 = executorch_exir_dialects_edge__ops_aten_expand_copy_default(aten__softmax_default, [1, 1, 8, 8]);  aten__softmax_default = None
    aten_view_copy_default_3 = executorch_exir_dialects_edge__ops_aten_view_copy_default(aten_expand_copy_default_2, [1, 8, 8]);  aten_expand_copy_default_2 = None
    aten_expand_copy_default_3 = executorch_exir_dialects_edge__ops_aten_expand_copy_default(v, [1, 1, 8, 8]);  v = None
    aten_view_copy_default_4 = executorch_exir_dialects_edge__ops_aten_view_copy_default(aten_expand_copy_default_3, [1, 8, 8]);  aten_expand_copy_default_3 = None
    aten_bmm_default_1 = executorch_exir_dialects_edge__ops_aten_bmm_default(aten_view_copy_default_3, aten_view_copy_default_4);  aten_view_copy_default_3 = aten_view_copy_default_4 = None
    aten_view_copy_default_5 = executorch_exir_dialects_edge__ops_aten_view_copy_default(aten_bmm_default_1, [1, 1, 8, 8]);  aten_bmm_default_1 = None
    return (aten_view_copy_default_5,)
```

Differential Revision: [D56119737](https://our.internmc.facebook.com/intern/diff/D56119737/)

[ghstack-poisoned]
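For reference, below is a minimal sketch of the attention math the new module computes directly; it mirrors the `scale_factor` / `attn_weight` lines added in `export_llama_lib.py` further down, but the `simple_sdpa` helper name, the shapes, and the mask construction are illustrative assumptions, not code from this commit. (The constant 0.5946035575013605 in the graph above appears to be `head_dim ** -0.25` for `head_dim = 8`, i.e. the `1/sqrt(head_dim)` scale split between q and k.)

```python
import math
import torch

def simple_sdpa(q, k, v, attn_mask):
    # q, k, v: (bs, n_heads, seqlen, head_dim); attn_mask: additive (0 / -inf) causal mask
    scale_factor = 1 / math.sqrt(q.size(-1))
    attn_weight = q @ k.transpose(-2, -1) * scale_factor  # (bs, n_heads, seqlen, seqlen)
    attn_weight += attn_mask                              # apply the causal mask additively
    attn_weight = torch.softmax(attn_weight, dim=-1)
    return attn_weight @ v                                # (bs, n_heads, seqlen, head_dim)

# Illustrative shapes matching the decomposed graph above (1 head, seqlen 8, head_dim 8).
q = k = v = torch.randn(1, 1, 8, 8)
mask = torch.triu(torch.full((8, 8), float("-inf")), diagonal=1)
out = simple_sdpa(q, k, v, mask)
```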
2 parents: 8e99c7a + f25c479

12 files changed (+141, -375 lines)

.ci/scripts/build_llama_android.sh

Lines changed: 1 addition & 0 deletions
@@ -26,6 +26,7 @@ install_executorch_and_backend_lib() {
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_XNNPACK=ON \
     -DEXECUTORCH_BUILD_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_QUANTIZED=ON \
     -DXNNPACK_ENABLE_ARM_BF16=OFF \
     -Bcmake-android-out .

examples/models/llama2/README.md

Lines changed: 8 additions & 5 deletions
@@ -5,7 +5,7 @@ This example demonstrates how to run a [Llama 2](https://ai.meta.com/llama/) 7B
 For Llama2, please refer to [the llama's github page](https://github.com/facebookresearch/llama) for details.
 Pretrained parameters are not included in this repo. Users are suggested to download them through [the llama's download page](https://ai.meta.com/resources/models-and-libraries/llama-downloads/).
 
-# What is Llama 2?
+# What are Llama 2 and 3?
 Llama is a family of large language models that uses publicly available data for training. These models are based on the transformer architecture, which allows it to process input sequences of arbitrary length and generate output sequences of variable length. One of the key features of Llama models is its ability to generate coherent and contextually relevant text. This is achieved through the use of attention mechanisms, which allow the model to focus on different parts of the input sequence as it generates output. Additionally, Llama models use a technique called “masked language modeling” to pre-train the model on a large corpus of text, which helps it learn to predict missing words in a sentence.
 
 Llama models have shown to perform well on a variety of natural language processing tasks, including language translation, question answering, and text summarization and are also capable of generating human-like text, making Llama models a useful tool for creative writing and other applications where natural language generation is important.
@@ -17,7 +17,9 @@ Please note that the models are subject to the [acceptable use policy](https://g
 
 # Results
 
-Since 7B Llama2 model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized model.
+Since 7B Llama2 model needs at least 4-bit quantization to fit even within some of the highend phones, results presented here correspond to 4-bit groupwise post-training quantized model.
+
+For Llama3, we can use the same process. Note that it's only supported in the ExecuTorch main branch.
 
 ## Quantization:
 We employed 4-bit groupwise per token dynamic quantization of all the linear layers of the model. Dynamic quantization refers to quantizating activations dynamically, such that quantization parameters for activations are calculated, from min/max range, at runtime. Here we quantized activations with 8bits (signed integer). Furthermore, weights are statically quantized. In our case weights were per-channel groupwise quantized with 4bit signed integer. For more information refer to this [page](https://github.com/pytorch-labs/ao/).
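To make the groupwise scheme concrete, here is a minimal sketch of 4-bit signed, groupwise weight quantization with per-group scales derived from the value range. This is an illustration only, not the actual code from pytorch-labs/ao or this repo; the function names, group size, and shapes are assumptions.

```python
import torch

def quantize_groupwise_4bit(w: torch.Tensor, group_size: int = 32):
    # w: (out_features, in_features); split each row into groups of `group_size` weights.
    out_features, in_features = w.shape
    wg = w.reshape(out_features, in_features // group_size, group_size)
    scales = wg.abs().amax(dim=-1, keepdim=True) / 7.0        # symmetric int4 range [-8, 7]
    q = torch.clamp(torch.round(wg / scales), min=-8, max=7).to(torch.int8)
    return q, scales

def dequantize_groupwise(q: torch.Tensor, scales: torch.Tensor, shape):
    # Reconstruct an approximation of the original weights from int4 values and per-group scales.
    return (q.float() * scales).reshape(shape)

w = torch.randn(16, 64)
q, scales = quantize_groupwise_4bit(w)
w_hat = dequantize_groupwise(q, scales, w.shape)
```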
@@ -230,7 +232,7 @@ adb push cmake-out-android/examples/models/llama2/llama_main /data/local/tmp/lla
 
 **2.3 Run model**
 ```
-adb shell "cd /data/local/tmp/llama && ./llama_main --model_path <model.pte> --tokenizer_path <tokenizer.bin> --prompt "Once upon a time" --seq_len 120
+adb shell "cd /data/local/tmp/llama && ./llama_main --model_path <model.pte> --tokenizer_path <tokenizer.bin> --prompt \"Once upon a time\" --seq_len 120"
 ```
 ## Step 6: Build Mobile apps
@@ -263,12 +265,13 @@ This example tries to reuse the Python code, with minimal modifications to make
 3. No dependencies on fairscale. The ColumnParallelLinear, ParallelEmbedding and training are not needed and supported in ExecuTorch.
 
 
-# Clean
-To clean your build:
+# Common Issues and Mitigations:
+- To clean your build:
 ```
 git clean -xfd
 pip uninstall executorch
 ./install_requirements.sh <options>
 
 rm -rf cmake-out
 ```
+- If you encounter `pthread` related issues during link time, add `pthread` in `target_link_libraries` in `CMakeLists.txt`

examples/models/llama2/builder.py

Lines changed: 4 additions & 0 deletions
@@ -62,6 +62,7 @@ def to_torch_dtype(self) -> torch.dtype:
 
 def load_llama_model(
     *,
+    modelname: str = "llama2",
     checkpoint: Optional[str] = None,
     checkpoint_dir: Optional[str] = None,
     params_path: str,
@@ -114,6 +115,7 @@ def load_llama_model(
 
     return LlamaEdgeManager(
         model=model,
+        modelname=modelname,
         weight_type=weight_type,
         dtype=dtype,
         use_kv_cache=use_kv_cache,
@@ -131,6 +133,7 @@ class LlamaEdgeManager:
     def __init__(
         self,
        model,
+        modelname,
        weight_type,
        dtype,
        use_kv_cache,
@@ -139,6 +142,7 @@ def __init__(
        verbose: bool = False,
    ):
        self.model = model
+        self.modelname = modelname
        self.weight_type = weight_type
        self.dtype = dtype
        self.example_inputs = example_inputs

examples/models/llama2/export_llama_lib.py

Lines changed: 15 additions & 3 deletions
@@ -38,7 +38,7 @@
 from .builder import DType, LlamaEdgeManager, load_llama_model, WeightType
 from .quant_lib import _get_pt2e_quantization_params, get_pt2e_quantizers
 
-from .quantize import EmbeddingOnlyInt8QuantHandler, WeightOnlyInt8QuantHandler
+from .quantize import EmbeddingQuantHandler, WeightOnlyInt8QuantHandler
 
 
 IS_FBCODE = True  # os.environ.get("FBCODE_PLATFORM", False)
@@ -145,6 +145,10 @@ def replace_sdpa_with_custom_op(module: torch.nn.Module) -> torch.nn.Module:
 
 
 class SDPASimple(torch.nn.Module):
+    """
+    This is a simpler implementation of the SDPA module defined in llama_transformer.py. Note that
+    the original SDPA module includes both some preprocessing logic and F.scaled_dot_product_attention.
+    """
     def __init__(
         self,
         kv_cache: KVCache,
@@ -168,6 +172,7 @@ def forward(
         seqlen,
         mask,
     ):
+        # The first few lines are the same as in the original SDPA module.
         q = q.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
         k = k.transpose(1, 2)
         v = v.transpose(1, 2)
@@ -177,6 +182,11 @@ def forward(
 
         k = k.repeat_interleave(self.n_rep, dim=1)
         v = v.repeat_interleave(self.n_rep, dim=1)
+
+        # The following is the part that differs. Instead of calling F.scaled_dot_product_attention,
+        # we use this implementation to avoid the decomposition of F.scaled_dot_product_attention,
+        # as that decomposition is too expensive. It gets rid of aten.full_like, aten.logical_not,
+        # aten.scalar_tensor, aten.where and 2 extra aten.mul.
         scale_factor = 1 / math.sqrt(q.size(-1))
         attn_weight = q @ k.transpose(-2, -1) * scale_factor
         attn_weight += attn_mask
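For context on how such a module gets wired in, a source-transformation pass like the `replace_sdpa_with_custom_op` referenced in the hunk above typically swaps submodules in place. The sketch below is generic and hedged: `OriginalBlock` and `SimpleBlock` are stand-ins rather than the actual SDPA/SDPASimple classes, and this is not the ExecuTorch implementation.

```python
import torch

class OriginalBlock(torch.nn.Module):  # stand-in for the original SDPA module
    def forward(self, x):
        return x

class SimpleBlock(torch.nn.Module):    # stand-in for the simpler replacement
    def forward(self, x):
        return x

def replace_blocks(module: torch.nn.Module) -> torch.nn.Module:
    # Recursively replace every OriginalBlock child with a SimpleBlock, leaving the rest intact.
    for name, child in module.named_children():
        if isinstance(child, OriginalBlock):
            setattr(module, name, SimpleBlock())
        else:
            replace_blocks(child)
    return module

model = torch.nn.Sequential(torch.nn.Linear(4, 4), OriginalBlock())
model = replace_blocks(model)
```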
@@ -559,7 +569,6 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
     )
     params_path = canonical_path(args.params)
     output_dir_path = canonical_path(args.output_dir, dir=True)
-    modelname = "llama2"
     weight_type = WeightType.FAIRSEQ2 if args.fairseq2 else WeightType.LLAMA
 
     # dtype override
@@ -613,7 +622,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
         group_size = int(group_size)
         bitwidth = int(bitwidth)
         transforms.append(
-            lambda model: EmbeddingOnlyInt8QuantHandler(
+            lambda model: EmbeddingQuantHandler(
                 model, bitwidth=bitwidth, group_size=group_size
             ).quantized_model()
         )
@@ -626,6 +635,7 @@ def _prepare_for_llama_export(modelname: str, args) -> LlamaEdgeManager:
 
     return (
         load_llama_model(
+            modelname=modelname,
            checkpoint=checkpoint_path,
            checkpoint_dir=checkpoint_dir,
            params_path=params_path,
@@ -673,6 +683,8 @@ def _export_llama(modelname, args) -> str:  # noqa: C901
        modelname, args
    ).export_to_edge(quantizers)
 
+    modelname = builder_exported_to_edge.modelname
+
    # to_backend
    partitioners = []
    if pt2e_quant_params is not None and pt2e_quant_params.quantize_linear is not None: