Commit 5e0e947

reduce unnecessary transpose
1 parent b10b67c commit 5e0e947

File tree: 3 files changed (+7, -11 lines)

examples/qualcomm/llama2/llama.py

Lines changed: 4 additions & 3 deletions
@@ -21,6 +21,7 @@
     setup_common_args_and_variables,
     SimpleADB,
 )
+
 from sentencepiece import SentencePieceProcessor
 
 
@@ -55,7 +56,7 @@ def sample_top_p(probs: torch.Tensor, top_p: float) -> torch.Tensor:
     return probs_indices.gather(dim=-1, index=next_token)
 
 with torch.no_grad():
-    while token_list[-1] != sp_model.eos_id() and pos < 32:
+    while token_list[-1] != sp_model.eos_id() and pos < 128:
         logits, k_cache, v_cache, kv_mask = module(
             torch.full((1, 1), token_list[pos]),
             torch.full((1, 1), pos),
@@ -160,11 +161,10 @@ def sample_top_p(probs: torch.Tensor, top_p: float) -> torch.Tensor:
         config = ModelArgs(**json.load(f))
         # TODO: support batch inputs if necessary
         config.max_batch_size = 1
-        config.n_layers = 1
 
     state_dict = torch.load(args.checkpoint)
     instance = LlamaModel(config)
-    instance.load_state_dict(state_dict["model"], strict=False)
+    instance.load_state_dict(state_dict["model"])
     inputs = instance.get_example_inputs()
     input_list = create_device_inputs(inputs)
     pte_filename = "llama2_qnn"
@@ -199,6 +199,7 @@ def sample_top_p(probs: torch.Tensor, top_p: float) -> torch.Tensor:
         per_channel_linear=per_channel_linear,
         shared_buffer=args.shared_buffer,
         metadata=instance.get_metadata(),
+        direct_io=True,
     )
 
     if args.compile_only:
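For context, sample_top_p in the hunk headers above is the top-p (nucleus) sampling helper; only its final line, return probs_indices.gather(dim=-1, index=next_token), is visible as diff context. Below is a minimal sketch of a sampler consistent with that signature and return statement, assuming the standard Llama-style implementation; it is not taken from this file.

import torch

def sample_top_p(probs: torch.Tensor, top_p: float) -> torch.Tensor:
    # Sort probabilities in descending order and keep the smallest prefix
    # whose cumulative mass stays within top_p; renormalize and sample.
    probs_sorted, probs_indices = torch.sort(probs, dim=-1, descending=True)
    cumulative = torch.cumsum(probs_sorted, dim=-1)
    probs_sorted[cumulative - probs_sorted > top_p] = 0.0
    probs_sorted /= probs_sorted.sum(dim=-1, keepdim=True)
    next_token = torch.multinomial(probs_sorted, num_samples=1)
    # Map the sampled position back to the original vocabulary index.
    return probs_indices.gather(dim=-1, index=next_token)

# Example: sample one token id from a (1, vocab_size) distribution.
probs = torch.softmax(torch.randn(1, 32000), dim=-1)
token = sample_top_p(probs, top_p=0.9)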

examples/qualcomm/llama2/model/static_llama.py

Lines changed: 2 additions & 7 deletions
@@ -64,13 +64,9 @@ def forward(
         v_cache = v_cache.view(bsz, self.max_seq_len, self.n_kv_heads, self.head_dim)
         k = k_cache * (1.0 - mask) + k * mask
         v = v_cache * (1.0 - mask) + v * mask
-        # (bs, n_local_heads, seqlen, head_dim)
-        q = q.transpose(1, 2)
-        k = k.transpose(1, 2)
 
-        attn = q @ k.transpose(-2, -1)
-        attn = attn * self.scale_tensor
-        attn = attn + atten_mask
+        attn = q.transpose(1, 2) @ k.permute(0, 2, 3, 1)
+        attn = attn * self.scale_tensor + atten_mask
         attn = self.attn_softmax(attn)
         y = attn @ v.transpose(1, 2)
         y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)
@@ -182,7 +178,6 @@ def forward(
 
         # update kv cache
         output_k_cache = torch.concat(output_k_cache)
-        output_k_cache = output_k_cache.transpose(1, 2).contiguous()
         output_k_cache = output_k_cache.view(
             self.max_batch_size, self.n_layers, self.max_seq_len, self.dim
         )
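The attention hunk above replaces separate transposes of q and k (plus a second transpose on k inside the matmul) with a single transpose on q and a single permute on k. A quick sanity check that the two formulations agree, using assumed shapes of (bsz, seq_len, n_heads, head_dim) for both operands rather than the module's actual cache views:

import torch

# Assumed shapes for illustration; the real module derives k from the
# (bsz, max_seq_len, n_kv_heads, head_dim) cache view shown in the hunk.
bsz, seq_len, n_heads, head_dim = 1, 8, 4, 16
q = torch.randn(bsz, seq_len, n_heads, head_dim)
k = torch.randn(bsz, seq_len, n_heads, head_dim)

# Old path: transpose q and k to (bsz, heads, seq, head_dim),
# then transpose k again inside the matmul.
old = q.transpose(1, 2) @ k.transpose(1, 2).transpose(-2, -1)

# New path: one permute takes k straight to (bsz, heads, head_dim, seq).
new = q.transpose(1, 2) @ k.permute(0, 2, 3, 1)

assert torch.allclose(old, new)

In eager PyTorch both paths are only stride changes, but the motivation here is presumably the exported graph, where each transpose lowers to a real operator, so expressing the layout change on k as one permute instead of two transposes trims the op count on device.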

examples/qualcomm/scripts/utils.py

Lines changed: 1 addition & 1 deletion
@@ -174,7 +174,7 @@ def build_executorch_binary(
     skip_node_op_set=None,
     quant_dtype: Optional[QuantDtype] = None,
     per_channel_linear=False, # TODO: remove this once QNN fully supports linear
-    direct_io=True, # TODO: temporal workaround for llama
+    direct_io=False, # TODO: temporal workaround for llama
     shared_buffer=False,
     metadata=None,
 ):
