
Commit 6ac0dfc

Author: morelos (committed)
Update base for Update on "[ET-VK][Ops] quantization op shaders and impl"
Creates the quantize_per_tensor and quantize_per_token logic shaders and implementations, linked with the testing framework. NOTE: Currently the only supported input types are **half** (fp16) and **float** (fp32), and the only supported output types are **byte** (uint8), **char** (int8), **short** (int16), and **int** (int32).

Differential Revision: [D75959064](https://our.internmc.facebook.com/intern/diff/D75959064/)

[ghstack-poisoned]
2 parents 3fa3891 + 2d09ab8 commit 6ac0dfc
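For orientation, the affine math these ops compute can be sketched in eager PyTorch as below. This is only a reference illustration of per-tensor and per-token quantization with helper names of my own choosing; it is not the Vulkan shader or ExecuTorch kernel code from the commit.

```python
import torch

def quantize_per_tensor_ref(x, scale, zero_point, qmin, qmax, dtype=torch.int8):
    # Affine quantization: q = clamp(round(x / scale) + zero_point, qmin, qmax)
    q = torch.round(x / scale) + zero_point
    return torch.clamp(q, qmin, qmax).to(dtype)

def quantize_per_token_ref(x, scales, zero_points, qmin, qmax, dtype=torch.int8):
    # Per-token quantization: one (scale, zero_point) pair per row of the
    # flattened [num_tokens, hidden_dim] view of the input.
    flat = x.reshape(-1, x.shape[-1])
    q = torch.round(flat / scales.view(-1, 1)) + zero_points.view(-1, 1)
    return torch.clamp(q, qmin, qmax).to(dtype).reshape(x.shape)

# Example: fp32 -> int8, one of the supported dtype combinations noted above.
x = torch.randn(4, 8)
q8 = quantize_per_tensor_ref(x, scale=0.05, zero_point=0, qmin=-128, qmax=127)
qt = quantize_per_token_ref(
    x, scales=torch.full((4,), 0.05), zero_points=torch.zeros(4), qmin=-128, qmax=127
)
```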

35 files changed: +1446 −136 lines

.ci/scripts/utils.sh

Lines changed: 3 additions & 2 deletions
@@ -156,13 +156,14 @@ build_executorch_runner() {
 }
 
 cmake_install_executorch_lib() {
+  build_type="${1:-Release}"
   echo "Installing libexecutorch.a and libportable_kernels.a"
   clean_executorch_install_folders
   retry cmake -DCMAKE_INSTALL_PREFIX=cmake-out \
-          -DCMAKE_BUILD_TYPE=Release \
+          -DCMAKE_BUILD_TYPE=${build_type} \
           -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
           -Bcmake-out .
-  cmake --build cmake-out -j9 --target install --config Release
+  cmake --build cmake-out -j9 --target install --config ${build_type}
 }
 
 download_stories_model_artifacts() {

.github/workflows/lint.yml

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ jobs:
         fi
 
         # This has already been cached in the docker image
-        lintrunner init 2> /dev/null
+        lintrunner init
 
         RC=0
         # Run lintrunner on all files

backends/qualcomm/_passes/i64_to_i32.py

Lines changed: 2 additions & 0 deletions
@@ -28,8 +28,10 @@ class I64toI32(ExportPass):
     I64_OPS = {
         exir_ops.edge.aten.argmin.default,
         exir_ops.edge.aten.arange.start_step,
+        exir_ops.edge.aten.cumsum.default,
         exir_ops.edge.aten.full.default,
         exir_ops.edge.aten.scalar_tensor.default,
+        exir_ops.edge.dim_order_ops._to_dim_order_copy.default,
     }
     # This dict is to ensure that the input of the OPs are int64 due to Pytorch restrictions.
     # For example, scatter op can only accept args[2], the index, as int64.

backends/qualcomm/_passes/lift_constant_scalar_operands.py

Lines changed: 2 additions & 1 deletion
@@ -86,7 +86,8 @@ def _build_tensor_constant(
             dtype=(
                 node.args[0].meta["val"].dtype
                 if not is_float_tensor(node)
-                and not SCALAR_OPS.get(node.target).use_self_dtype
+                and (info := SCALAR_OPS.get(node.target))
+                and not info.use_self_dtype
                 else node.meta["val"].dtype
             ),
             device=node.meta["val"].device,
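The guard added above appears aimed at op targets missing from SCALAR_OPS, where calling `.use_self_dtype` on a `None` result would raise. A minimal sketch of that None-safe walrus pattern follows; the registry and names here are hypothetical, not the pass itself.

```python
from dataclasses import dataclass

@dataclass
class ScalarOpInfo:
    use_self_dtype: bool

# Hypothetical registry standing in for SCALAR_OPS; real keys are op targets.
SCALAR_OPS = {"aten.add.Scalar": ScalarOpInfo(use_self_dtype=False)}

def keeps_arg_dtype(target) -> bool:
    # Old form: `not SCALAR_OPS.get(target).use_self_dtype` raises
    # AttributeError when the target is unregistered (get() returns None).
    # The walrus guard short-circuits to False instead.
    return bool((info := SCALAR_OPS.get(target)) and not info.use_self_dtype)

print(keeps_arg_dtype("aten.add.Scalar"))  # True
print(keeps_arg_dtype("aten.unknown.op"))  # False, no exception
```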

backends/qualcomm/_passes/replace_inf_values.py

Lines changed: 6 additions & 0 deletions
@@ -30,6 +30,12 @@ def call(self, graph_module: torch.fx.GraphModule):
                     arg_list[index] = torch.finfo(torch.float32).min
                 elif arg == float("inf"):
                     arg_list[index] = torch.finfo(torch.float32).max
+
+            if node.target == torch.ops.aten.masked_fill.Scalar:
+                if arg_list[2] == torch.finfo(torch.float32).max:
+                    arg_list[2] = 255
+                elif arg_list[2] == torch.finfo(torch.float32).min:
+                    arg_list[2] = -255
             node.args = tuple(arg_list)
 
         graph_module.recompile()
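For the masked_fill.Scalar special case above, the fill value ends up as ±255 rather than the float32 extremes. My reading of the intent, illustrated below, is that a bounded fill stays representable when the graph is later run in reduced precision while still acting as a "very negative" mask value; this is an interpretation, not code from the pass.

```python
import torch

attn_mask = torch.tensor([[0.0, 1.0], [1.0, 0.0]])
scores = torch.zeros(2, 2)

# A bounded fill such as -255 avoids the float32 min/max extremes that the
# generic inf replacement would otherwise insert as the masked_fill value.
masked = scores.masked_fill(attn_mask != 0, -255.0)
print(masked)
```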

backends/qualcomm/builders/op_cum_sum.py

Lines changed: 2 additions & 0 deletions
@@ -51,6 +51,8 @@ def define_node(
         dim = self.get_param(node, input_tensor)
 
         output_tensor = self.get_tensor(node, node)
+        if output_tensor.dtype == torch.int64:
+            output_tensor = output_tensor.to(torch.int32)
         output_tensor_wrapper = self.define_tensor(
             node,
             node,
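The cast above narrows an int64 cumsum output before wrapping it for QNN, consistent with cumsum being added to the I64toI32 pass earlier in this commit. A trivial illustration of the dtype narrowing (my example, not builder code):

```python
import torch

out = torch.cumsum(torch.randint(0, 10, (4,), dtype=torch.int64), dim=0)
print(out.dtype)                  # torch.int64
print(out.to(torch.int32).dtype)  # torch.int32
```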

backends/qualcomm/tests/models.py

Lines changed: 10 additions & 10 deletions
@@ -1101,6 +1101,16 @@ def forward(self, x):
         return torch.mean(x, (-1, -2))
 
 
+class MaskedFill(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, attn_mask):
+        return attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
+            attn_mask == 0, float(0.0)
+        )
+
+
 class Maximum(torch.nn.Module):
     def __init__(self):
         super().__init__()
@@ -1751,16 +1761,6 @@ def forward(self, x):
         )
 
 
-class MaskedFill(torch.nn.Module):
-    def __init__(self):
-        super().__init__()
-
-    def forward(self, attn_mask):
-        return attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(
-            attn_mask == 0, float(0.0)
-        )
-
-
 # Mimi Decoder has 0D tensor which QNN cannot handle.
 class ZeroDimTensor(torch.nn.Module):
     def __init__(self):

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 64 additions & 3 deletions
@@ -272,9 +272,24 @@ def test_qnn_backend_cos(self):
         self.lower_module_and_test_output(module, sample_input)
 
     def test_qnn_backend_cumsum(self):
-        module = CumSum()  # noqa: F405
-        sample_input = (torch.randn(4),)
-        self.lower_module_and_test_output(module, sample_input)
+        sample_input = ()
+        test_comb = [
+            {
+                QCOM_MODULE: [CumSum()],  # noqa: F405
+                QCOM_SAMPLE_INPUTS: [
+                    (torch.randn(4),),
+                    (torch.randint(0, 10, size=(4,)),),
+                ],
+            }
+        ]
+
+        index = 0
+        for comb in test_comb:
+            for module in comb[QCOM_MODULE]:
+                for sample_input in comb[QCOM_SAMPLE_INPUTS]:
+                    with self.subTest(i=index):
+                        self.lower_module_and_test_output(module, sample_input)
+                        index += 1
 
     def test_qnn_backend_einsum_outer_product(self):
         module = EinsumOuterProduct()  # noqa: F405
@@ -311,6 +326,12 @@ def test_qnn_backend_element_wise_add(self):
                 QCOM_MODULE: [AddConstantFloat()],  # noqa: F405
                 QCOM_SAMPLE_INPUTS: [(torch.randn(2, 5, 1, 3),)],
             },
+            {
+                QCOM_MODULE: [
+                    AddConstantLong(),  # noqa: F405
+                ],
+                QCOM_SAMPLE_INPUTS: [(torch.randint(0, 10, size=(2, 3)),)],
+            },
         ]
 
         index = 0
@@ -4526,6 +4547,40 @@ def test_retinanet(self):
         else:
             self.assertGreaterEqual(msg["mAP"], 0.6)
 
+    def test_roberta(self):
+        if not self.required_envs([self.sentence_dataset]):
+            self.skipTest("missing required envs")
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/roberta.py",
+            "--dataset",
+            self.sentence_dataset,
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--device",
+            self.device,
+            "--model",
+            self.model,
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+        ]
+        if self.host:
+            cmds.extend(["--host", self.host])
+
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                self.assertGreaterEqual(msg["accuracy"], 0.5)
+
     def test_squeezenet(self):
         if not self.required_envs([self.image_dataset]):
             self.skipTest("missing required envs")
@@ -5344,6 +5399,11 @@ def setup_environment():
         help="Location for imagenet dataset",
         type=str,
     )
+    parser.add_argument(
+        "--sentence_dataset",
+        help="Location for sentence dataset",
+        type=str,
+    )
     parser.add_argument(
         "-p",
         "--pretrained_weight",
@@ -5402,6 +5462,7 @@ def setup_environment():
     TestQNN.executorch_root = args.executorch_root
     TestQNN.artifact_dir = args.artifact_dir
     TestQNN.image_dataset = args.image_dataset
+    TestQNN.sentence_dataset = args.sentence_dataset
    TestQNN.pretrained_weight = args.pretrained_weight
    TestQNN.model_name = args.model_name
    TestQNN.online_prepare = args.online_prepare

backends/qualcomm/tests/utils.py

Lines changed: 1 addition & 0 deletions
@@ -183,6 +183,7 @@ class TestQNN(unittest.TestCase):
     executorch_root: str = ""
     artifact_dir: str = ""
     image_dataset: str = ""
+    sentence_dataset: str = ""
     pretrained_weight: str = ""
     enable_profile: bool = False
     op_package_dir: str = ""

backends/xnnpack/_passes/channels_last_tagged_reshape_pass.py

Lines changed: 15 additions & 20 deletions
@@ -91,18 +91,10 @@ def is_nchw_node(self, node: torch.fx.Node) -> bool:
         return not self.is_nhwc_node(node)
 
     def requires_nhwc_input(self, node: torch.fx.Node) -> bool:
-        return (
-            node.target in self.memory_sensitive_ops_nhwc
-            or node.name == "output"
-            and not node.args[0][0].meta["val"].is_contiguous()
-        )
+        return node.target in self.memory_sensitive_ops_nhwc
 
     def requires_nchw_inputs(self, node: torch.fx.Node) -> bool:
-        return (
-            node.target in self.memory_sensitive_ops_nchw
-            or node.name == "output"
-            and node.args[0][0].meta["val"].is_contiguous()
-        )
+        return node.target in self.memory_sensitive_ops_nchw
 
 
     def can_be_converted_to_nhwc(self, node: torch.fx.Node) -> bool:
         # There are two conditions that must be met for a node to be able to
@@ -380,18 +372,21 @@ def call(self, graph_module: torch.fx.GraphModule):  # noqa: C901
                 # This node has no inputs so we don't need to change anything
                 continue
 
-            if self.requires_nhwc_input(node):
+            # Need special case for output node because it can have multiple output dim orders as we can output a tuple multiple nodes
+            if node.op == "output":
+                out_tuple = node.args[0]
+                for out_node in out_tuple:
+                    if out_node.meta["val"].is_contiguous():
+                        self.input_to_nchw(graph_module, out_node, node)
+                    else:
+                        self.input_to_nhwc(graph_module, out_node, node)
+            elif self.requires_nhwc_input(node):
                 # Nodes which enter this branch are ones that require their
                 # first input to be nhwc. This makes this node's output nhwc too
-                # Currently, all nodes like this should have all of their other
-                # inputs as nchw, so fail if this is not true
-                if node.name == "output":
-                    self.input_to_nhwc(graph_module, node.args[0][0], node)
-                else:
-                    self.input_to_nhwc(graph_module, node.args[0], node)
-
-                for input_node in node.all_input_nodes[1:]:
-                    if self.is_nhwc_node(input_node):
+
+                self.input_to_nhwc(graph_module, node.args[0], node)
+                for input_node in node.all_input_nodes:
+                    if input_node.op == "placeholder" and self.is_nhwc_node(input_node):
                         raise AssertionError(
                             f"Expected {input_node} to be NCHW in channels last reshape pass"
                         )
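The rewritten output handling keys off each output tensor's contiguity to decide whether it should be fed as NCHW or NHWC. A standalone check of that property is shown below; it only illustrates the PyTorch memory-format query, not the pass itself.

```python
import torch

x_nchw = torch.randn(1, 3, 8, 8)                        # default contiguous / NCHW
x_nhwc = x_nchw.to(memory_format=torch.channels_last)   # channels-last / NHWC

print(x_nchw.is_contiguous())                                   # True
print(x_nhwc.is_contiguous())                                   # False
print(x_nhwc.is_contiguous(memory_format=torch.channels_last))  # True
```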

backends/xnnpack/test/passes/test_channels_last_tagged_reshape.py

Lines changed: 47 additions & 0 deletions
@@ -335,3 +335,50 @@ def test_dq_conv2d_channels_last_tagged_reshape_pass(self) -> None:
             )
             .run_method_and_compare_outputs()
         )
+
+    class ConvAddConvOutput(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 16, 3)
+            self.conv2 = torch.nn.Conv2d(16, 16, 3)
+
+        def forward(self, x):
+            y = self.conv1(x)
+            z = torch.add(y, 1.0)
+            out1 = self.conv2(z)
+            out2 = z
+            return out1, out2
+
+    ConvAddConvOutputModule = ConvAddConvOutput()
+
+    def test_conv_add_conv_output(self):
+        x = torch.randn(1, 3, 8, 8)
+
+        self.run_tester(self.ConvAddConvOutput().eval(), (x,))
+
+        x_cl = x.to(memory_format=torch.channels_last)
+        self.run_tester(self.ConvAddConvOutput().eval(), (x_cl,))
+
+    class ThreeOutputsModel(torch.nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.conv1 = torch.nn.Conv2d(3, 3, 3)
+            self.conv2 = torch.nn.Conv2d(3, 3, 3)
+            self.linear = torch.nn.Linear(6, 6)
+
+        def forward(self, x):
+            conv1_out = self.conv1(x)
+            conv2_out = self.conv2(x)
+            linear_out = self.linear(x)
+
+            return linear_out, conv1_out, conv2_out
+
+    ThreeOutputsModelModule = ThreeOutputsModel()
+
+    def test_three_outputs_model(self):
+        x = torch.randn(1, 3, 6, 6)
+
+        self.run_tester(self.ThreeOutputsModelModule.eval(), (x,))
+
+        x_cl = x.to(memory_format=torch.channels_last)
+        self.run_tester(self.ThreeOutputsModelModule.eval(), (x_cl,))

examples/qualcomm/oss_scripts/llama/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -36,6 +36,8 @@ list(
     ${CMAKE_CURRENT_LIST_DIR}/runner/token_generator.h
     ${CMAKE_CURRENT_LIST_DIR}/runner/imem_alloc.h
     ${CMAKE_CURRENT_LIST_DIR}/runner/client_mem.h
+    ${CMAKE_CURRENT_LIST_DIR}/runner/lhd_token_generator.cpp
+    ${CMAKE_CURRENT_LIST_DIR}/runner/lhd_token_generator.h
     ${CMAKE_CURRENT_LIST_DIR}/runner/rpc_mem.cpp
     ${CMAKE_CURRENT_LIST_DIR}/runner/rpc_mem.h
     ${CMAKE_CURRENT_LIST_DIR}/runner/kv_manager.cpp

examples/qualcomm/oss_scripts/llama/README.md

Lines changed: 15 additions & 3 deletions
@@ -4,13 +4,13 @@
 This file provides you the instructions to run LLAMA model with different parameters via Qualcomm HTP backend. We currently support the following models:
 1. LLAMA2 Stories 110M
 2. LLAMA3.2 1B
-3. LLAMA3.2 3B (WIP)
+3. LLAMA3.2 3B
 
 We offer the following modes to execute the model:
 
-KV Cache Mode: In KV Cache mode, the model takes in a single previous token and generates the next predicted token along with its KV cache. It is efficient for generating subsequent tokens after the initial prompt.
+- KV Cache Mode: In KV Cache mode, the model takes in a single previous token and generates the next predicted token along with its KV cache. It is efficient for generating subsequent tokens after the initial prompt.
 
-Hybrid Mode: Hybrid mode leverages the strengths of both AR-N model and KV cache modes to optimize token generation speed. Initially, it uses AR-N model to efficiently generate the prompt's key-value (KV) cache. Then, the mode switches to KV cache mode, which excels at generating subsequent tokens.
+- Hybrid Mode: Hybrid mode leverages the strengths of both the AR-N model and KV cache modes to optimize token generation speed. Initially, it uses the AR-N model to efficiently generate the prompt's key-value (KV) cache. Then, the mode switches to KV cache mode, which excels at generating subsequent tokens.
   - AR-N model: The auto-regression (AR) length determines the number of tokens to consume and the number of logits to produce. Use it to process the prompt and generate the key-value (kv) cache, which serves as a prompt processor in hybrid mode.
   - Prompt processing with AR-N model:
     <figure>
@@ -19,6 +19,7 @@ Hybrid Mode: Hybrid mode leverages the strengths of both AR-N model and KV cache
     </figcaption>
     </figure>
 
+- Lookahead Mode: Lookahead Mode introduces [lookahead decoding](https://arxiv.org/abs/2402.02057) and uses the AR-N model to process the prompt, enhancing token generation speed. While decoding multiple tokens in a single step is infeasible, an LLM can generate multiple guess tokens in parallel. These guess tokens may fit into future parts of the generated sequence. The lookahead decoder generates and verifies these guess tokens, integrating them into the sequence when they fit. In some cases, it can obtain more than one token in a single step. The result is lossless.
 
 ## Instructions
 ### Note
@@ -127,3 +128,14 @@ You can select the KV Cache update mechanism at runtime by setting the `KV_UPDATER` variable.
 ```bash
 python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode hybrid --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --kv_updator ${KV_UPDATER}
 ```
+
+You can choose lookahead mode to enhance decoding speed. To use it, specify the following parameters:
+- `--ngram` (n-gram size): The size of the n-grams used in the lookahead process.
+- `--window` (window size): How many future tokens the algorithm attempts to predict in each step.
+- `--gcap` (verification candidates): The maximum number of speculations, or candidate n-grams, that the algorithm considers in each step for verification. It balances the trade-off between computational efficiency and exploring more possibilities.
+
+For more details, please refer to the paper ["Break the Sequential Dependency of LLM Inference Using Lookahead Decoding"](https://arxiv.org/abs/2402.02057).
+
+```bash
+python examples/qualcomm/oss_scripts/llama/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint consolidated.00.pth --params params.json --tokenizer_model tokenizer.model --llama_model llama3_2 --model_mode lookahead --prefill_ar_len 32 --max_seq_len 128 --prompt "what is 1+1" --ngram 3 --window 2 --gcap 2
+```
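As a rough illustration of the verification step that `--ngram`, `--window`, and `--gcap` control, a simplified Python sketch follows. It reflects my interpretation of lookahead decoding in general, not the runner's `lhd_token_generator` implementation.

```python
def verify_guesses(confirmed, candidate_ngrams, predict_next):
    """Accept the longest candidate n-gram whose tokens the model would have
    predicted anyway, so more than one token can be committed per step."""
    best = []
    for ngram in candidate_ngrams:            # at most --gcap candidates
        accepted, ctx = [], list(confirmed)
        for tok in ngram:
            if predict_next(ctx) != tok:      # model must agree token-by-token
                break
            accepted.append(tok)
            ctx.append(tok)
        if len(accepted) > len(best):
            best = accepted
    return best

# Toy "model" that always predicts last token + 1; the first guess n-gram
# yields two tokens in a single step, and the output remains lossless.
next_tok = lambda ctx: ctx[-1] + 1
print(verify_guesses([1, 2, 3], [[4, 5, 9], [4, 7]], next_tok))  # [4, 5]
```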

examples/qualcomm/oss_scripts/llama/TARGETS

Lines changed: 11 additions & 0 deletions
@@ -45,6 +45,17 @@ python_binary(
     ],
 )
 
+python_binary(
+    name = "eval_llama_qnn",
+    srcs = ["eval_llama_qnn.py"],
+    main_function = "executorch.examples.qualcomm.oss_scripts.llama.eval_llama_qnn.main",
+    deps = [
+        ":llama_lib",
+        "//executorch/examples/models/llama:eval_library",
+        "fbsource//third-party/pypi/lm-eval:lm-eval",
+    ],
+)
+
 runtime.command_alias(
     name = "llama_qnn",
     env = {
