
Commit f552ee8

Merge branch 'main' into migrate-benchmark-results-v3
2 parents a6889a5 + c726a9b

12 files changed: +394 -96 lines

backends/arm/arm_backend.py

Lines changed: 3 additions & 1 deletion

@@ -135,7 +135,9 @@ def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder":
         self.quantize_io = quantize_io
         return self

-    def set_input_order(self, input_order: str = None) -> "ArmCompileSpecBuilder":
+    def set_input_order(
+        self, input_order: Optional[str] = None
+    ) -> "ArmCompileSpecBuilder":
        """
        Reorder the inputs coming in. This may be required when inputs > 1.
        And while using the U55/U85 CompileSpec.
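
For reference, a minimal usage sketch of the updated signature, assuming the builder is imported from backends/arm/arm_backend.py; the surrounding calls are illustrative only and are not part of this diff.

# Hypothetical sketch: only the Optional[str] typing of input_order comes from this commit.
from typing import Optional
from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder  # assumed import path

def make_builder(input_order: Optional[str] = None) -> ArmCompileSpecBuilder:
    builder = ArmCompileSpecBuilder()
    builder.set_quantize_io(True)
    # None (the default) keeps the incoming input order; a non-None string
    # reorders inputs for multi-input models on U55/U85 (see the docstring above).
    builder.set_input_order(input_order)
    return builder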

backends/cadence/fusion_g3/operators/op_add.cpp

Lines changed: 6 additions & 6 deletions

@@ -95,26 +95,26 @@ Tensor& add_out(
   }

   for (int i = 0; i < max_dim; i++) {
-    out_shape[i] = 1;
+    out_shape[i] = 1;
     inp1_shape[i] = 1;
     inp2_shape[i] = 1;
   }
-
-  int offset_out = max_dim - out.dim();
+
+  int offset_out = max_dim - out.dim();
   int offset_inp1 = max_dim - a.dim();
   int offset_inp2 = max_dim - b.dim();
-
+
   for (int i = 0; i < out.dim(); i++) {
     out_shape[i + offset_out] = out.size(i);
   }
   for (int i = 0; i < a.dim(); i++) {
     inp1_shape[i + offset_inp1] = a.size(i);
   }
   for (int i = 0; i < b.dim(); i++) {
-    inp2_shape[i + offset_inp2] = b.size(i);
+    inp2_shape[i + offset_inp2] = b.size(i);
   }

-  if ((compute_type == ScalarType::Int) && (optimized)){
+  if ((compute_type == ScalarType::Int) && (optimized)) {
     const int* const inp1_data = a.const_data_ptr<int>();
     const int* const inp2_data = b.const_data_ptr<int>();
     int* const out_data = out.mutable_data_ptr<int>();
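
The changed lines above are formatting-only, but the surrounding loops carry the actual broadcasting logic: each operand's sizes are right-aligned into a max_dim-long shape padded with leading 1s. A rough Python sketch of that shape-alignment step, with illustrative names that are not part of the ExecuTorch API:

# Right-align a tensor's sizes into a rank-max_dim shape, padding with 1s,
# mirroring the out_shape/inp1_shape/inp2_shape loops in op_add.cpp above.
def right_align_shape(sizes: list[int], max_dim: int) -> list[int]:
    shape = [1] * max_dim
    offset = max_dim - len(sizes)
    for i, size in enumerate(sizes):
        shape[i + offset] = size
    return shape

# Example: aligning a (3, 4) operand against a 4-D output gives (1, 1, 3, 4).
assert right_align_shape([3, 4], max_dim=4) == [1, 1, 3, 4]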

backends/cadence/fusion_g3/operators/op_mul.cpp

Lines changed: 5 additions & 5 deletions

@@ -87,23 +87,23 @@ Tensor& mul_out(
   }

   for (int i = 0; i < max_dim; i++) {
-    out_shape[i] = 1;
+    out_shape[i] = 1;
     inp1_shape[i] = 1;
     inp2_shape[i] = 1;
   }
-
-  int offset_out = max_dim - out.dim();
+
+  int offset_out = max_dim - out.dim();
   int offset_inp1 = max_dim - a.dim();
   int offset_inp2 = max_dim - b.dim();
-
+
   for (int i = 0; i < out.dim(); i++) {
     out_shape[i + offset_out] = out.size(i);
   }
   for (int i = 0; i < a.dim(); i++) {
     inp1_shape[i + offset_inp1] = a.size(i);
   }
   for (int i = 0; i < b.dim(); i++) {
-    inp2_shape[i + offset_inp2] = b.size(i);
+    inp2_shape[i + offset_inp2] = b.size(i);
   }

   if ((compute_type == ScalarType::Int) && (optimized)) {

backends/cadence/hifi/operators/op_maximum.cpp

Lines changed: 0 additions & 1 deletion

@@ -23,7 +23,6 @@ using torch::executor::apply_binary_elementwise_fn;
 using torch::executor::Error;
 using torch::executor::resize_to_broadcast_target_size;

-
 namespace cadence {
 namespace impl {
 namespace HiFi {

backends/cadence/hifi/operators/op_pow.cpp

Lines changed: 0 additions & 1 deletion

@@ -351,4 +351,3 @@ Tensor& pow_Scalar_out(
 } // namespace HiFi
 } // namespace impl
 } // namespace cadence
-

backends/cadence/hifi/operators/quantized_linear_out.cpp

Lines changed: 15 additions & 16 deletions

@@ -26,8 +26,7 @@ using ::executorch::aten::Tensor;
 using ::executorch::runtime::getLeadingDims;
 using ::executorch::runtime::KernelRuntimeContext;

-
-// The nnlib kernel to compute quantized linear via matmul.
+// The nnlib kernel to compute quantized linear via matmul.

 void _quantized_linear_asym8u(
     const Tensor& in,

@@ -48,22 +47,22 @@ void _quantized_linear_asym8u(
   const int32_t* __restrict__ bias_data = bias.const_data_ptr<int32_t>();
   uint8_t* __restrict__ out_data = out.mutable_data_ptr<uint8_t>();
   int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u(
-      out_data,
-      weight_data,
-      in_data,
-      bias_data,
-      out_dim,
-      in_dim,
-      in_dim,
-      leading_dims,
-      in_dim,
-      out_dim,
-      1,
+      out_data,
+      weight_data,
+      in_data,
+      bias_data,
+      out_dim,
+      in_dim,
+      in_dim,
+      leading_dims,
+      in_dim,
+      out_dim,
+      1,
       -weight_zero_point.const_data_ptr<int32_t>()[0], // mat1_zero_bias
       -in_zero_point, // mat2_zero_bias
-      out_multiplier.const_data_ptr<int32_t>()[0],
-      out_shift.const_data_ptr<int32_t>()[0],
-      out_zero_point);
+      out_multiplier.const_data_ptr<int32_t>()[0],
+      out_shift.const_data_ptr<int32_t>()[0],
+      out_zero_point);
   ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed");
 }

examples/models/llama/TARGETS

Lines changed: 14 additions & 0 deletions

@@ -93,6 +93,7 @@ runtime.python_library(
         "source_transformation/sdpa.py",
         "source_transformation/spin_quant.py",
         "source_transformation/vulkan_rope.py",
+        "source_transformation/attention_sink.py",
     ],
     _is_external_target = True,
     base_module = "executorch.examples.models.llama",

@@ -213,3 +214,16 @@ runtime.python_test(
         "//executorch/examples/models/llama:llama_transformer",
     ],
 )
+
+runtime.python_test(
+    name = "attention_sink_test",
+    srcs = [
+        "source_transformation/test_attention_sink.py",
+    ],
+    supports_static_listing = False,
+    deps = [
+        "fbsource//third-party/pypi/parameterized:parameterized",
+        "//caffe2:torch",
+        ":export_library",
+    ],
+)

examples/models/llama/llama_transformer.py

Lines changed: 85 additions & 54 deletions

@@ -147,6 +147,81 @@ def __post_init__(self):
         self.head_dim = self.dim // self.n_heads


+class Rope(torch.nn.Module):
+    def __init__(self, params: ModelArgs):
+        super().__init__()
+        self.params = params
+        if self.params.use_hf_rope:
+            self.precompute_freqs_cis = hf_precompute_freqs_cis
+        else:
+            self.precompute_freqs_cis = partial(
+                precompute_freqs_cis, use_scaled=self.params.use_scaled_rope
+            )
+        freqs_cos, freqs_sin = self.precompute_freqs_cis(
+            self.params.head_dim,
+            (
+                self.params.max_seq_len  # Normal llama2.
+                if self.params.ffn_dim_multiplier is None
+                else self.params.max_seq_len * 2  # Sharded checkpoint.
+            ),
+            self.params.rope_freq_base,
+        )
+        self.register_buffer("freqs_cos", freqs_cos, persistent=False)
+        self.register_buffer("freqs_sin", freqs_sin, persistent=False)
+        if self.params.use_hf_rope:
+            self.apply_rotary_emb = hf_apply_rotary_emb
+        else:
+            self.apply_rotary_emb = RotaryEmbedding()
+
+    def forward(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        freqs_cos: torch.Tensor,
+        freqs_sin: torch.Tensor,
+    ):
+        return self.apply_rotary_emb(q, k, freqs_cos, freqs_sin)
+
+    def get_freqs(self, input_pos: Optional[torch.Tensor], seq_len: int):
+        """
+        Get the precomputed frequencies for the given input position and sequence length.
+
+        Args:
+            input_pos (torch.Tensor): The input position tensor.
+            seq_len (int): The sequence length.
+
+        Returns:
+            Tuple[torch.Tensor, torch.Tensor]: The precomputed frequencies for the given input position and sequence length.
+        """
+        if self.params.use_kv_cache:
+            assert (
+                input_pos is not None
+            ), "input_pos must be provided when use_kv_cache is True"
+
+            if self.params.enable_dynamic_shape:
+                # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos.
+                input_pos_item = input_pos[-1].item()
+                torch._check_is_size(input_pos_item)
+                torch._check(input_pos_item < self.params.max_seq_len)
+                # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor
+                freqs_cos = self.freqs_cos.narrow(0, input_pos_item, seq_len)
+                # pyre-ignore: Incompatible parameter type [6]
+                freqs_sin = self.freqs_sin.narrow(0, input_pos_item, seq_len)
+            else:
+                # When not using dynamic shape, use of the .item results in
+                # symints, due to querying the data from tensor.
+                # this path avoids that for mps backend, although probably mps backend
+                # can support dynamic shape?
+                freqs_cos = self.freqs_cos[input_pos]
+                freqs_sin = self.freqs_sin[input_pos]
+
+        else:
+            assert input_pos is None, "input_pos is unused when use_kv_cache is False"
+            freqs_cos = self.freqs_cos[:seq_len]
+            freqs_sin = self.freqs_sin[:seq_len]
+        return freqs_cos, freqs_sin
+
+
 class KVCache(nn.Module):
     def __init__(
         self,

@@ -266,7 +341,7 @@ def forward(


 class Attention(nn.Module):
-    def __init__(self, args: ModelArgs, layer_id: int):
+    def __init__(self, args: ModelArgs, layer_id: int, rope: Rope):
         super().__init__()
         self.use_kv_cache = args.use_kv_cache
         self.n_heads = args.n_heads

@@ -287,6 +362,8 @@ def __init__(self, args: ModelArgs, layer_id: int):

         self.layer_id = layer_id

+        self.rope = rope
+
         causal_mask = torch.tril(
             torch.ones(
                 self.max_seq_len,

@@ -303,7 +380,7 @@ def __init__(self, args: ModelArgs, layer_id: int):
                 args.max_seq_len,
                 self.n_kv_heads,
                 self.head_dim,
-                not args.use_sdpa_with_kv_cache_op,  # if we are using the custom op dont transpose the cache. Expect untransposed q k v
+                not args.use_sdpa_with_kv_cache_op,  # if we are using the custom op don't transpose the cache. Expect untransposed q k v
                 args.enable_dynamic_shape,
             )
         self.SDPA = SDPA(

@@ -314,10 +391,6 @@ def __init__(self, args: ModelArgs, layer_id: int):
             max_seq_len=self.max_seq_len,
             enable_dynamic_shape=args.enable_dynamic_shape,
         )
-        if args.use_hf_rope:
-            self.apply_rotary_emb = hf_apply_rotary_emb
-        else:
-            self.apply_rotary_emb = RotaryEmbedding()

     def forward(
         self,

@@ -336,7 +409,7 @@ def forward(
         v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim)

         # RoPE relative positional embeddings
-        q, k = self.apply_rotary_emb(q, k, freqs_cos, freqs_sin)
+        q, k = self.rope.forward(q, k, freqs_cos, freqs_sin)

         if self.use_kv_cache:
             assert input_pos is not None

@@ -424,13 +497,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:


 class TransformerBlock(nn.Module):
-    def __init__(self, layer_id: int, args: ModelArgs):
+    def __init__(self, layer_id: int, args: ModelArgs, rope: Rope):
         super().__init__()
         self.use_kv_cache = args.use_kv_cache
         self.n_heads = args.n_heads
         self.dim = args.dim
         self.head_dim = args.head_dim
-        self.attention = Attention(args, layer_id)
+        self.attention = Attention(args, layer_id, rope)
         if args.moe:
             self.block_sparse_moe = MOEFeedForward(args)
         else:

@@ -459,33 +532,17 @@ def __init__(self, params: ModelArgs):
         self.n_layers = params.n_layers

         self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim)
+        self.rope = Rope(params)
         self.layers = torch.nn.ModuleList()
         for layer_id in range(params.n_layers):
-            self.layers.append(TransformerBlock(layer_id, params))
+            self.layers.append(TransformerBlock(layer_id, params, self.rope))
         self.norm = RMSNorm(params.dim, eps=params.norm_eps)
         self.output = nn.Linear(params.dim, params.vocab_size, bias=False)
         self.use_kv_cache = params.use_kv_cache
         self.generate_full_logits = params.generate_full_logits
         self.max_seq_len = params.max_seq_len
         self.input_prune_map = params.input_prune_map
         self.output_prune_map = params.output_prune_map
-        if params.use_hf_rope:
-            self.precompute_freqs_cis = hf_precompute_freqs_cis
-        else:
-            self.precompute_freqs_cis = partial(
-                precompute_freqs_cis, use_scaled=params.use_scaled_rope
-            )
-        freqs_cos, freqs_sin = self.precompute_freqs_cis(
-            params.head_dim,
-            (
-                params.max_seq_len  # Normal llama2.
-                if params.ffn_dim_multiplier is None
-                else params.max_seq_len * 2  # Sharded checkpoint.
-            ),
-            params.rope_freq_base,
-        )
-        self.register_buffer("freqs_cos", freqs_cos, persistent=False)
-        self.register_buffer("freqs_sin", freqs_sin, persistent=False)

     def forward(
         self,

@@ -502,33 +559,7 @@ def forward(
         if tokens is not None and h is None:
             h = self.tok_embeddings(tokens)
         seqlen = h.shape[1]
-
-        if self.use_kv_cache:
-            assert (
-                input_pos is not None
-            ), "input_pos must be provided when use_kv_cache is True"
-
-            if self.params.enable_dynamic_shape:
-                # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos.
-                input_pos_item = input_pos[-1].item()
-                torch._check_is_size(input_pos_item)
-                torch._check(input_pos_item < self.params.max_seq_len)
-                # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor
-                freqs_cos = self.freqs_cos.narrow(0, input_pos_item, seqlen)
-                # pyre-ignore: Incompatible parameter type [6]
-                freqs_sin = self.freqs_sin.narrow(0, input_pos_item, seqlen)
-            else:
-                # When not using dynamic shape, use of the .item results in
-                # symints, due to querying the data from tensor.
-                # this path avoids that for mps backend, although probably mps backend
-                # can support dynamic shape?
-                freqs_cos = self.freqs_cos[input_pos]
-                freqs_sin = self.freqs_sin[input_pos]
-
-        else:
-            assert input_pos is None, "input_pos is unused when use_kv_cache is False"
-            freqs_cos = self.freqs_cos[:seqlen]
-            freqs_sin = self.freqs_sin[:seqlen]
+        freqs_cos, freqs_sin = self.rope.get_freqs(input_pos, seqlen)

         for layer in self.layers:
             h = layer(
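
Taken together, the llama_transformer.py changes move RoPE ownership out of Transformer and Attention into a single shared Rope module. A condensed sketch of the resulting call flow; the import path and ModelArgs fields are assumed from this diff rather than verified against the repository:

# Illustrative only: mirrors how Transformer.forward and Attention.forward
# use the shared Rope module after this refactor.
import torch
from executorch.examples.models.llama.llama_transformer import ModelArgs, Rope  # assumed path

params = ModelArgs(use_kv_cache=True, enable_dynamic_shape=False)  # fields referenced in the diff
rope = Rope(params)  # owns the freqs_cos/freqs_sin buffers and apply_rotary_emb

# Transformer.forward now asks the shared module for per-step frequencies...
input_pos = torch.tensor([0])  # first decode position when the KV cache is enabled
freqs_cos, freqs_sin = rope.get_freqs(input_pos, seq_len=1)

# ...and each Attention layer applies them through the same instance:
#     q, k = self.rope.forward(q, k, freqs_cos, freqs_sin)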
