 import torch

+from torch.library import impl
+
 try:
     op = torch.ops.llama.sdpa_with_kv_cache.default
     assert op is not None

[... unchanged lines omitted ...]
     torch.ops.load_library(full_path)
     op = torch.ops.llama.sdpa_with_kv_cache.default
     assert op is not None
+
+# Open the existing "llama" op namespace to register additional kernels (IMPL kind).
+custom_ops_lib = torch.library.Library("llama", "IMPL")
+
+
+def _validate_params(
+    query,
+    key,
+    value,
+    key_cache,
+    value_cache,
+    start_pos,
+    seq_len,
+    attn_mask,
+    dropout_p,
+    is_causal,
+    scale,
+):
+    assert (
+        query.dim() == 4
+    ), f"Expected query to be 4 dimensional but got {query.dim()} dimensions."
+    assert (
+        key.dim() == 4
+    ), f"Expected key to be 4 dimensional but got {key.dim()} dimensions."
+    assert (
+        value.dim() == 4
+    ), f"Expected value to be 4 dimensional but got {value.dim()} dimensions."
+
+    assert (
+        query.dtype == torch.float32
+    ), f"Expected query to be float32 but got {query.dtype}"
+    assert key.dtype == torch.float32, f"Expected key to be float32 but got {key.dtype}"
+    assert (
+        value.dtype == torch.float32
+    ), f"Expected value to be float32 but got {value.dtype}"
+
+    assert (
+        key_cache.dim() == 4
+    ), f"Expected key_cache to be 4 dimensional but got {key_cache.dim()}"
+    assert (
+        value_cache.dim() == 4
+    ), f"Expected value_cache to be 4 dimensional but got {value_cache.dim()}"
+
+    assert (
+        key_cache.dtype == torch.float32
+    ), f"Expected key_cache to be float32 but got {key_cache.dtype}"
+    assert (
+        value_cache.dtype == torch.float32
+    ), f"Expected value_cache to be float32 but got {value_cache.dtype}"
+
+    assert (
+        key_cache.size() == value_cache.size()
+    ), f"Key cache and value cache must have same size but got {key_cache.size()} and {value_cache.size()}"
+
+    # These asserts are valid, but they require adding constrain_as_size/value
+    # calls to the model, and I don't want to do that right now.
+    # assert start_pos < key_cache.size(
+    #     1
+    # ), f"Start position {start_pos} must be less than sequence length {key_cache.size(1)}"
+    # assert (start_pos + seq_len) < key_cache.size(
+    #     1
+    # ), f"Start position + seq_len = {start_pos + seq_len} must be less than sequence length {key_cache.size(1)}"
+
+    assert seq_len == 1, "Only seq_len == 1 is supported for now."
+
+    if attn_mask is not None:
+        assert (
+            attn_mask.dim() == 2
+        ), f"Expected attn_mask to be 2 dimensional but got {attn_mask.dim()} dimensions."
+        assert (attn_mask.dtype == torch.float32) or (
+            attn_mask.dtype == torch.float16
+        ), f"Expected attn_mask to be float32 or float16 but got {attn_mask.dtype}"
+
+
+@impl(custom_ops_lib, "sdpa_with_kv_cache", "Meta")
+def sdpa_with_kv_cache_meta(
+    query,
+    key,
+    value,
+    key_cache,
+    value_cache,
+    start_pos,
+    seq_len,
+    attn_mask=None,
+    dropout_p=0.0,
+    is_causal=False,
+    scale=None,
+):
+    _validate_params(
+        query,
+        key,
+        value,
+        key_cache,
+        value_cache,
+        start_pos,
+        seq_len,
+        attn_mask,
+        dropout_p,
+        is_causal,
+        scale,
+    )
+
+    # Meta kernel: compute only output metadata (shape/dtype mirror the query).
+    return torch.empty_like(query)
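
As a quick sanity check of the Meta registration, the op can be invoked on meta-device tensors once this module has been imported (so the op and its Meta kernel are registered). This is a minimal sketch; the concrete shapes and the `[batch, seq, heads, head_dim]` cache layout are illustrative assumptions, not something this diff pins down.

```python
import torch

# Illustrative shapes only; layout assumed to be [batch, seq, heads, head_dim].
bsz, n_heads, head_dim, max_seq = 1, 8, 64, 128

q = torch.randn(bsz, 1, n_heads, head_dim, device="meta")
k = torch.randn(bsz, 1, n_heads, head_dim, device="meta")
v = torch.randn(bsz, 1, n_heads, head_dim, device="meta")
k_cache = torch.zeros(bsz, max_seq, n_heads, head_dim, device="meta")
v_cache = torch.zeros(bsz, max_seq, n_heads, head_dim, device="meta")

# Meta-device inputs dispatch to sdpa_with_kv_cache_meta; seq_len must be 1 for now.
out = torch.ops.llama.sdpa_with_kv_cache(q, k, v, k_cache, v_cache, 0, 1)
assert out.shape == q.shape  # empty_like(query): the output mirrors the query
```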