[Executorch][llama] Make RoPE freq calculation broadcast for per head

kimishpatel · kimishpatel · commit 7c6a7115ad60 · 2024-03-11T12:39:41.000-07:00
Pull Request resolved: #2353 This is a workaround, and may not even be worth landing, to avoid broadcasting semantics in the mul op and, for that matter, any binary op. The current implementation of the optimized ops doesn't handle broadcasting and falls back to the portable op implementation. This diff also fixes an issue where (as seen in llama) the two tensors of a binary op are not broadcasting, but they have a different # of dims, which results in invocation of the unoptimized path, e.g. a = [1, 1, 2048], b = [2048], out = [1, 1, 2048]. In the llama case this is the optimized path when generating one token at a time, but not so during pre-fill. Making the optimized op handle broadcasting, and support vectorization, is not hard, but may take some time. ghstack-source-id: 218210434 @exported-using-ghexport Differential Revision: [D54766067](https://our.internmc.facebook.com/intern/diff/D54766067/)
diff --git a/examples/models/llama2/llama_transformer.py b/examples/models/llama2/llama_transformer.py
@@ -109,20 +109,26 @@ def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
     )
 
 
-def precompute_freqs_cis(dim: int, end: int, theta: float):
+def precompute_freqs_cis(dim: int, n_heads: int, end: int, theta: float):  # n_heads sets the size of the middle axis of both returned tables
     freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
     t = torch.arange(end, device=freqs.device)  # pyre-ignore
     freqs = torch.outer(t, freqs).float()  # pyre-ignore
     freqs_cos = torch.cos(freqs)
     freqs_sin = torch.sin(freqs)
+    freqs_cos = freqs_cos.view(end, 1, dim // 2)  # insert a size-1 axis between the position and frequency axes
+    freqs_cos = freqs_cos.expand(end, n_heads, dim // 2).contiguous()  # materialize one copy per head; per the commit message this avoids broadcasting in the later mul
+    freqs_sin = freqs_sin.view(end, 1, dim // 2)  # same reshaping for the sine table
+    freqs_sin = freqs_sin.expand(end, n_heads, dim // 2).contiguous()  # result shape: (end, n_heads, dim // 2)
     return freqs_cos, freqs_sin
 
 
 def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor):
     ndim = x.ndim
     assert 0 <= 1 < ndim
-    assert freqs_cis.shape == (x.shape[1], x.shape[-1])
-    shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)]
+    assert freqs_cis.shape == (x.shape[1], x.shape[2], x.shape[-1])  # NOTE(review): reads x.shape[2], but the assert above only guarantees ndim >= 2 - confirm callers always pass ndim >= 3
+    shape = [  # keep x's sizes on dims 1, 2 and the last; every other dim becomes 1
+        d if (i == 1 or i == 2 or i == ndim - 1) else 1 for i, d in enumerate(x.shape)
+    ]
     return freqs_cis.view(shape)
 
 
@@ -413,6 +419,7 @@ def __init__(self, params: ModelArgs):
 
         freqs_cos, freqs_sin = precompute_freqs_cis(
             params.dim // params.n_heads,
+            params.n_heads,
             params.max_seq_len,
             params.rope_freq_base,
         )
diff --git a/kernels/optimized/cpu/op_mul.cpp b/kernels/optimized/cpu/op_mul.cpp
@@ -20,6 +20,29 @@ namespace native {
 using Tensor = exec_aten::Tensor;
 using ScalarType = exec_aten::ScalarType;
 
+namespace {
+
+// Returns true when a, b and out satisfy the dtype and shape conditions checked below for the optimized path. TODO: move to a generic util, as this is applicable to all binary ops.
+bool can_use_optimized_path(
+    const Tensor& a,
+    const Tensor& b,
+    const Tensor& out) {
+  ScalarType a_type = a.scalar_type();
+  ScalarType b_type = b.scalar_type();
+  ScalarType out_type = out.scalar_type();
+
+  bool can_use_optimized_path = true;
+  can_use_optimized_path =
+      can_use_optimized_path && ((a_type == b_type) && (a_type == out_type)); // all three dtypes must match
+  can_use_optimized_path = can_use_optimized_path &&
+      (a_type != ScalarType::Half && b_type != ScalarType::Half); // Half inputs are rejected
+  can_use_optimized_path = can_use_optimized_path &&
+      (a.sizes().equals(b.sizes()) || // either the shapes are identical, or ...
+       (a.numel() == b.numel() && a.numel() == out.numel())); // ... all three element counts agree, e.g. [1, 1, 2048] vs [2048]
+  return can_use_optimized_path; // NOTE(review): the numel test also accepts same-numel shapes that are not broadcast-compatible (e.g. [2, 3] vs [3, 2]) - confirm this is intended
+}
+} // namespace
+
 Tensor& opt_mul_out(
     RuntimeContext& ctx,
     const Tensor& a,
@@ -31,8 +54,7 @@ Tensor& opt_mul_out(
   ScalarType b_type = b.scalar_type();
   ScalarType out_type = out.scalar_type();
 
-  if (a_type == b_type && a_type == out_type && a.sizes().equals(b.sizes()) &&
-      a_type != ScalarType::Half) {
+  if (can_use_optimized_path(a, b, out)) {
     // Resize for dynamic shape
     auto error = resize_tensor(out, a.sizes());
     ET_CHECK_MSG(error == Error::Ok, "Failed to resize output tensor.");