Commit ae9227e

Update base for Update on "qnn end to end flow"
Patch a few changes, including:

- support bool tensor type
- support fp16 and fix the 8w8a quantization
- add two non-supported ops (slice_scatter and index_put) in common_defs.py

Stories model working end to end:

AOT, fp16:

```
python -m examples.models.llama2.export_llama -kv --qnn -c stories110M.pt -p params.json
```

AOT, quantized:

```
python -m examples.models.llama2.export_llama -kv --qnn --pt2e_quantize -c stories110M.pt -p params.json
```

Runtime:

```
/llama_main --model_path=llama2_fp16_qnn_2.21.pte --tokenizer_path=tokenizer.bin --prompt="Once"
```

Output:

```
Once upon a time, there was a boy named Tim. Tim had a pet dog named Max. Max was a big, strong dog. They liked to play and run in the park. One day, Tim and Max went to the park to play. They saw a cat. The cat was up in a tree. Max wanted to help the cat. He tried to climb the tree, but he could not. Then, something unexpected happened. Max started to climb the tree! He was very strong. Max helped the cat come down. The cat was happy. Tim was so proud of his pet.
```

The stories model is too small and sensitive to quantization.

Differential Revision: [D56119738](https://our.internmc.facebook.com/intern/diff/D56119738/)

[ghstack-poisoned]
2 parents: f177c79 + c61ef44

13 files changed: +116 −25 lines


.ci/docker/ci_commit_pins/pytorch.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -1 +1 @@
-0a038cf0cff2d071b7359ac0491fd2ba7798a438
+868e5ced5df34f1aef3703654f76e03f5126b534
```

backends/vulkan/runtime/api/Adapter.cpp

Lines changed: 1 addition & 2 deletions
```diff
@@ -401,8 +401,7 @@ std::string Adapter::stringize() const {
   ss << "  Memory Info {" << std::endl;
   ss << "    Memory Types [" << std::endl;
   for (size_t i = 0; i < mem_props.memoryTypeCount; ++i) {
-    ss << "      "
-       << " [Heap " << mem_props.memoryTypes[i].heapIndex << "] "
+    ss << "      " << " [Heap " << mem_props.memoryTypes[i].heapIndex << "] "
        << get_memory_properties_str(mem_props.memoryTypes[i].propertyFlags)
        << std::endl;
   }
```

backends/vulkan/runtime/graph/ops/OperatorRegistry.cpp

Lines changed: 3 additions & 1 deletion
```diff
@@ -16,7 +16,9 @@ bool OperatorRegistry::has_op(const std::string& name) {
 
 OperatorRegistry::OpFunction& OperatorRegistry::get_op_fn(
     const std::string& name) {
-  return table_.find(name)->second;
+  const auto it = table_.find(name);
+  VK_CHECK_COND(it != table_.end(), "Could not find operator with name ", name);
+  return it->second;
 }
 
 void OperatorRegistry::register_op(const std::string& name, OpFunction& fn) {
```
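The change above replaces an unchecked `table_.find(name)->second`, which dereferences an invalid iterator when the operator is unregistered, with a lookup guarded by `VK_CHECK_COND`. A minimal Python sketch of the same checked-lookup pattern (the `OpRegistry` class below is hypothetical, for illustration only):

```python
from typing import Callable, Dict


class OpRegistry:
    """Minimal sketch of a name -> function registry with a checked lookup."""

    def __init__(self) -> None:
        self._table: Dict[str, Callable] = {}

    def register_op(self, name: str, fn: Callable) -> None:
        self._table[name] = fn

    def get_op_fn(self, name: str) -> Callable:
        # Fail loudly instead of dereferencing a missing entry,
        # mirroring the VK_CHECK_COND added in the C++ diff.
        if name not in self._table:
            raise KeyError(f"Could not find operator with name {name}")
        return self._table[name]


registry = OpRegistry()
registry.register_op("aten.add", lambda a, b: a + b)
assert registry.get_op_fn("aten.add")(1, 2) == 3
```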

backends/vulkan/runtime/graph/ops/glsl/indexing_utils.h

Lines changed: 5 additions & 5 deletions
```diff
@@ -50,8 +50,8 @@
 // describe sizes. As an example, let's say we want to swap dimensions 0,1 for a
 // tensor of shape {4,3,2,24} to obtain {3,4,2,24}. Then, x=4, y=3 and
 // plane=2*24=48.
-#define SWAP_ADJ_DIMS(cur, x, y, plane) \
-  cur + \
-      plane*( \
-          (1 - y) * ((cur % (x * y * plane)) / (y * plane)) + \
-          (x - 1) * ((cur % (y * plane)) / plane))
+#define SWAP_ADJ_DIMS(cur, x, y, plane) \
+  cur + \
+      plane * \
+          ((1 - y) * ((cur % (x * y * plane)) / (y * plane)) + \
+           (x - 1) * ((cur % (y * plane)) / plane))
```
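The macro maps a flat index in the original layout to the corresponding index after the two adjacent dimensions of sizes `x` and `y` are swapped. As a quick sanity check of the arithmetic (an illustration, not part of this commit), a NumPy sketch can compare the formula against an actual transpose using the {4,3,2,24} example from the comment:

```python
import numpy as np


def swap_adj_dims(cur: int, x: int, y: int, plane: int) -> int:
    # Python port of the SWAP_ADJ_DIMS macro; // matches GLSL integer division.
    i = (cur % (x * y * plane)) // (y * plane)  # coordinate along the dim of size x
    j = (cur % (y * plane)) // plane            # coordinate along the dim of size y
    return cur + plane * ((1 - y) * i + (x - 1) * j)


# The comment's example: swapping dims 0,1 of a {4,3,2,24} tensor,
# so x=4, y=3, plane=2*24=48.
x, y, plane = 4, 3, 48
flat = np.arange(x * y * plane)
swapped = flat.reshape(x, y, plane).transpose(1, 0, 2).reshape(-1)
assert all(swapped[swap_adj_dims(c, x, y, plane)] == flat[c] for c in range(flat.size))
```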

docs/README.md

Lines changed: 4 additions & 0 deletions
````diff
@@ -57,7 +57,11 @@ To build the documentation locally:
    ```bash
    pip3 install -r ./.ci/docker/requirements-ci.txt
    ```
+1. Update submodules
 
+   ```bash
+   git submodule sync && git submodule update --init
+   ```
 1. Run:
 
    ```bash
````

examples/models/llama2/export_llama_lib.py

Lines changed: 25 additions & 8 deletions
```diff
@@ -34,7 +34,6 @@
 from executorch.sdk.etrecord import generate_etrecord
 from executorch.util.activation_memory_profiler import generate_memory_trace
 from sentencepiece import SentencePieceProcessor
-from torch.nn import functional as F
 
 from .builder import DType, LlamaEdgeManager, load_llama_model, WeightType
 from .quant_lib import _get_pt2e_quantization_params, get_pt2e_quantizers
@@ -174,17 +173,17 @@ def forward(
         v = v.transpose(1, 2)
 
         k, v = self.kv_cache.update(input_pos, k, v)
-        mask = mask[None, None, input_pos]
+        attn_mask = mask[None, None, input_pos]
 
         k = k.repeat_interleave(self.n_rep, dim=1)
         v = v.repeat_interleave(self.n_rep, dim=1)
-        scores = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(self.head_dim)
-        scores = F.softmax(scores.float(), dim=-1).type_as(q)
-        scores = scores + mask
-        output = torch.matmul(scores, v)  # (bs, n_local_heads, seqlen, head_dim)
+        scale_factor = 1 / math.sqrt(q.size(-1))
+        attn_weight = q @ k.transpose(-2, -1) * scale_factor
+        attn_weight += attn_mask
+        attn_weight = torch.softmax(attn_weight, dim=-1)
+        y = attn_weight @ v
 
-        output = output.transpose(1, 2).contiguous().view(bsz, seqlen, -1)
-        return output
+        return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
 
 
 def replace_sdpa_with_simple_sdpa(module: torch.nn.Module):
@@ -200,6 +199,24 @@ def replace_sdpa_with_simple_sdpa(module: torch.nn.Module):
     return module
 
 
+def replace_causal_mask(module: torch.nn.Module):
+    for buffer_fqn_name, buffer in module.named_buffers():
+        buffer_name = buffer_fqn_name.split(".")[-1]
+        if buffer_name == "mask":
+            max_seq_len = buffer.shape[-1]
+            mask = torch.full(
+                (max_seq_len, max_seq_len),
+                float("-inf"),
+                device="cpu",
+            )
+
+            mask = torch.triu(mask, diagonal=1)
+            module.register_buffer(buffer_name, mask)
+    for _, child in module.named_children():
+        replace_causal_mask(child)
+    return module
+
+
 def quantize(
     model: torch.nn.Module,
     qmode: str,
```
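For reference, the rewritten attention above is the standard scaled-dot-product formulation with an additive mask, and `replace_causal_mask` registers a float mask that is 0 on and below the diagonal and -inf above it. The following sketch (illustrative only, not from the commit) shows the same arithmetic as a standalone function and checks it against `torch.nn.functional.scaled_dot_product_attention`:

```python
import math

import torch
import torch.nn.functional as F


def simple_sdpa(q, k, v, attn_mask):
    # Same arithmetic as the rewritten SDPA above: explicit matmul,
    # additive mask, and softmax, with no float() upcast.
    scale_factor = 1 / math.sqrt(q.size(-1))
    attn_weight = q @ k.transpose(-2, -1) * scale_factor
    attn_weight = attn_weight + attn_mask
    attn_weight = torch.softmax(attn_weight, dim=-1)
    return attn_weight @ v


q = torch.randn(1, 8, 4, 16)  # (bs, n_heads, seqlen, head_dim)
k = torch.randn(1, 8, 4, 16)
v = torch.randn(1, 8, 4, 16)
# Additive causal mask, matching what replace_causal_mask registers.
mask = torch.triu(torch.full((4, 4), float("-inf")), diagonal=1)

ref = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
assert torch.allclose(simple_sdpa(q, k, v, mask), ref, atol=1e-6)
```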

examples/models/llama2/runner/runner.cpp

Lines changed: 1 addition & 2 deletions
```diff
@@ -472,8 +472,7 @@ std::string statsToJsonString(const Runner::Stats& stats) {
      << "\"prompt_eval_end_ms\":" << stats.prompt_eval_end_ms << ","
      << "\"first_token_ms\":" << stats.first_token_ms << ","
      << "\"aggregate_sampling_time_ms\":" << stats.aggregate_sampling_time_ms
-     << ","
-     << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
+     << "," << "\"SCALING_FACTOR_UNITS_PER_SECOND\":"
      << stats.SCALING_FACTOR_UNITS_PER_SECOND << "}";
   return ss.str();
 }
```

examples/models/llama2/tests/TARGETS

Lines changed: 15 additions & 0 deletions
```diff
@@ -0,0 +1,15 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+oncall("executorch")
+
+runtime.python_unittest(
+    name = "test_simple_sdpa",
+    srcs = [
+        "test_simple_sdpa.py",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/examples/models/llama2:export_library",
+        "//executorch/examples/models/llama2:llama_transformer",
+    ],
+)
```
examples/models/llama2/tests/test_simple_sdpa.py

Lines changed: 54 additions & 0 deletions
```diff
@@ -0,0 +1,54 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import copy
+import unittest
+
+import torch
+from executorch.examples.models.llama2.export_llama_lib import SDPASimple
+from executorch.examples.models.llama2.llama_transformer import KVCache, SDPA
+
+
+class SDPATest(unittest.TestCase):
+    def test_simple_sdpa(self):
+        # Verify the correctness between the simple SDPA and the original SDPA module defined in llama_transformer.py
+        max_batch_size = 1
+        max_seq_length = 128
+        n_heads = 8
+        head_dim = 8
+        dim = 64
+        n_rep = 1
+        bsz = 1
+        seqlen = 1
+        n_local_heads = n_heads
+        kv_cache = KVCache(
+            max_batch_size=max_batch_size,
+            max_seq_length=max_seq_length,
+            n_heads=n_heads,
+            head_dim=head_dim,
+            transpose_cache=True,
+        )
+        sdpa = SDPA(
+            kv_cache=copy.deepcopy(kv_cache), dim=dim, head_dim=head_dim, n_rep=n_rep
+        )
+        input_pos = torch.tensor([0])
+        query = torch.randn(1, 1, n_local_heads, head_dim)
+        key = torch.randn(1, 1, n_local_heads, head_dim)
+        value = torch.randn(1, 1, n_local_heads, head_dim)
+        mask = torch.randn(max_seq_length, max_seq_length)
+        sdpa_output = sdpa(
+            input_pos, query, key, value, bsz=bsz, seqlen=seqlen, mask=mask
+        )
+
+        simple_sdpa = SDPASimple(
+            kv_cache=copy.deepcopy(kv_cache), dim=dim, head_dim=head_dim, n_rep=n_rep
+        )
+        simple_sdpa_output = simple_sdpa(
+            input_pos, query, key, value, bsz=bsz, seqlen=seqlen, mask=mask
+        )
+
+        # Compare the outputs from the two SDPA implementations
+        self.assertTrue(torch.allclose(sdpa_output, simple_sdpa_output))
```

kernels/portable/cpu/op_cumsum.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -11,8 +11,8 @@
 #include <executorch/runtime/platform/assert.h>
 #include <cmath>
 #include <cstddef>
-//#include <cstdint>
-//#include <type_traits>
+// #include <cstdint>
+// #include <type_traits>
 
 namespace torch {
 namespace executor {
```

runtime/core/portable_type/optional.h

Lines changed: 2 additions & 2 deletions
```diff
@@ -74,8 +74,8 @@ class optional final {
   }
 
   optional& operator=(optional&& rhs) noexcept(
-      std::is_nothrow_move_assignable<T>::value&&
-          std::is_nothrow_move_constructible<T>::value) {
+      std::is_nothrow_move_assignable<T>::value &&
+      std::is_nothrow_move_constructible<T>::value) {
     if (init_ && !rhs.init_) {
       clear();
     } else if (!init_ && rhs.init_) {
```

sdk/etdump/etdump_flatcc.cpp

Lines changed: 2 additions & 1 deletion
```diff
@@ -103,7 +103,8 @@ ETDumpGen::ETDumpGen(Span<uint8_t> buffer) {
     alloc.set_buffer(
         (uint8_t*)buffer_with_builder,
         buffer_size,
-        (size_t)((buffer_size / 4 > max_alloc_buf_size) ? max_alloc_buf_size : buffer_size / 4));
+        (size_t)((buffer_size / 4 > max_alloc_buf_size) ? max_alloc_buf_size
+                                                        : buffer_size / 4));
     et_flatcc_custom_init(builder, &alloc);
   } else {
     builder = (struct flatcc_builder*)malloc(sizeof(struct flatcc_builder));
```

third-party/pytorch

Submodule pytorch updated 589 files
