Commit 3145bde

Revert portion to move to next PR
1 parent 4587852 commit 3145bde

File tree

3 files changed: +5 additions, −166 deletions

examples/models/llama/export_llama_lib.py
examples/models/llama/source_transformation/quantized_kv_cache.py
examples/models/llama/source_transformation/sdpa.py


examples/models/llama/export_llama_lib.py

Lines changed: 3 additions & 43 deletions

@@ -54,7 +54,6 @@
 )
 from .source_transformation.quantized_kv_cache import (
     replace_kv_cache_with_quantized_kv_cache,
-    replace_torchtune_kv_cache_with_quantized_kv_cache,
 )
 from .source_transformation.rms_norm import replace_rms_norm_with_native_rms_norm
 
@@ -66,15 +65,10 @@
     replace_sdpa_with_coreml_sdpa,
     replace_sdpa_with_custom_op,
     replace_sdpa_with_flex_sdpa,
-    replace_sdpa_with_sdpa_only_custom_op,
     replace_sdpa_with_simple_sdpa,
 )
-
-from .source_transformation.torchtune.attention import replace_mha_with_inference_mha
-
 from .source_transformation.vulkan_rope import replace_with_vulkan_rotary_emb
 
-
 IS_FBCODE = True  # os.environ.get("FBCODE_PLATFORM", False)
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
 logging.basicConfig(level=logging.INFO, format=FORMAT)
@@ -243,7 +237,7 @@ def build_args_parser() -> argparse.ArgumentParser:
         "--use_sdpa_with_kv_cache",
         default=False,
         action="store_true",
-        help="Whether to use a custom sdpa + kv_cache update when kv cache is enabled.",
+        help="Whether to use sdpa_with_kv_cache update op when using kv cache",
     )
     parser.add_argument(
         "--disable_dynamic_shape",
@@ -595,18 +589,6 @@ def _validate_args(args):
     if args.num_sharding > 0 and not args.qnn:
         raise ValueError("Model shard is only supported with qnn backend now.")
 
-    if args.model in TORCHTUNE_DEFINED_MODELS:
-        if args.use_sdpa_with_kv_cache:
-            if not args.use_kv_cache and not args.quantize_kv_cache:
-                raise ValueError(
-                    f"TorchTune-defined {args.model} only works with custom SDPA op + quantized KV cache at the moment. Please enable use_kv_cache and quantize_kv_cache when use_sdpa_with_kv_cache is enabled."
-                )
-        if args.use_kv_cache:
-            if not args.quantize_kv_cache:
-                raise ValueError(
-                    f"TorchTune-defined {args.model} only works with quantized KV cache at the moment. Please enable quantize_kv_cache when use_kv_cache is enabled."
-                )
-
 
 def _export_llama(args) -> LLMEdgeManager:  # noqa: C901
     _validate_args(args)
@@ -910,7 +892,6 @@ def _load_llama_model(
 def _get_source_transforms(  # noqa
     modelname: str, dtype_override: Optional[DType], args
 ) -> List[Callable[[torch.nn.Module], torch.nn.Module]]:
-    is_torchtune_model = modelname in TORCHTUNE_DEFINED_MODELS
     transforms = []
 
     if args.use_spin_quant:
@@ -962,29 +943,12 @@ def _get_source_transforms(  # noqa
     if args.expand_rope_table:
         transforms.append(materialze_broadcast_of_rope_freq_cis)
 
-    transforms.append(replace_mha_with_inference_mha)
     if args.use_sdpa_with_kv_cache:
-        if is_torchtune_model:
-            assert (
-                args.use_kv_cache and args.quantize_kv_cache
-            ), "use_sdpa_with_kv_cache requires use_kv_cache=True and quantize_kv_cache=True for TorchTune at the moment."
-            transforms.append(replace_mha_with_inference_mha)
-            transforms.append(replace_sdpa_with_sdpa_only_custom_op)
-        else:
-            transforms.append(replace_sdpa_with_custom_op)
+        transforms.append(replace_sdpa_with_custom_op)
 
     if args.quantize_kv_cache:
        assert args.use_kv_cache, "quantize_kv_cache requires use_kv_cache=True"
-        if is_torchtune_model:
-            transforms.append(
-                lambda module: replace_torchtune_kv_cache_with_quantized_kv_cache(
-                    module,
-                    is_transposed=not args.use_sdpa_with_kv_cache,
-                    enable_dynamic_shape=args.enable_dynamic_shape,
-                )
-            )
-        else:
-            transforms.append(replace_kv_cache_with_quantized_kv_cache)
+        transforms.append(replace_kv_cache_with_quantized_kv_cache)
 
     if args.use_kv_cache:
         if args.qnn:
@@ -1019,8 +983,4 @@ def _get_source_transforms(  # noqa
     if args.vulkan:
         transforms.append(replace_with_vulkan_rotary_emb)
 
-    print(
-        f"Performing the following source transformations: {[transform.__name__ for transform in transforms]}"
-    )
-
     return transforms
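For context on how a list like the one returned by _get_source_transforms is typically consumed, here is a minimal, hypothetical sketch (not taken from this repository); the apply_source_transforms helper and DummyModel are illustrative assumptions only:

import torch
from typing import Callable, List


def apply_source_transforms(
    model: torch.nn.Module,
    transforms: List[Callable[[torch.nn.Module], torch.nn.Module]],
) -> torch.nn.Module:
    # Each transform takes a module and returns a (possibly modified-in-place) module,
    # so the list can simply be folded over the model in order.
    for transform in transforms:
        model = transform(model)
    return model


class DummyModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(8, 8)

    def forward(self, x):
        return self.linear(x)


model = apply_source_transforms(DummyModel(), transforms=[])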

examples/models/llama/source_transformation/quantized_kv_cache.py

Lines changed: 1 addition & 63 deletions

@@ -11,7 +11,6 @@
 import torch.nn as nn
 from executorch.examples.models.llama.llama_transformer import KVCache
 from torch.ao.quantization.fx._decomposed import quantized_decomposed_lib  # noqa: F401
-from torchtune.modules.kv_cache import KVCache as TorchTuneKVCache
 
 
 """
@@ -208,31 +207,8 @@ def from_float(cls, kv_cache, cache_type: QuantizedCacheType):
             kv_cache.enable_dynamic_shape,
         )
 
-    @classmethod
-    def from_torchtune_float(
-        cls,
-        kv_cache,
-        cache_type: QuantizedCacheType,
-        is_transposed: bool,
-        enable_dynamic_shape: bool,
-    ):
-        cache_shape = kv_cache.k_cache.shape
-        if kv_cache.is_tranposed:
-            max_batch_size, n_heads, max_seq_length, head_dim = cache_shape
-        else:
-            max_batch_size, max_seq_length, n_heads, head_dim = cache_shape
-        return cls(
-            max_batch_size,
-            max_seq_length,
-            n_heads,
-            head_dim,
-            cache_type,
-            is_transposed,
-            enable_dynamic_shape,
-        )
-
 
-def replace_kv_cache_with_quantized_kv_cache(module: nn.Module) -> nn.Module:
+def replace_kv_cache_with_quantized_kv_cache(module):
     logging.warning(
         "Replacing KVCache with QuantizedKVCache. This modifies the model in place."
     )
@@ -246,41 +222,3 @@ def replace_kv_cache_with_quantized_kv_cache(module: nn.Module) -> nn.Module:
         else:
             replace_kv_cache_with_quantized_kv_cache(child)
     return module
-
-
-def replace_torchtune_kv_cache_with_quantized_kv_cache(
-    module: nn.Module, is_transposed: bool, enable_dynamic_shape: bool
-) -> nn.Module:
-    """
-    Replace TorchTune KVCache with Executorch's quantized KVCache.
-
-    Args:
-        is_transposed: whether q, k, and v are transposed. Should set to false when sdpa custom op source transform is enabled.
-        enable_dynamic_shape: whether dynamic shapes are enabled.
-
-    Returns:
-        The passed in model.
-    """
-    logging.warning(
-        "Replacing KVCache with QuantizedKVCache. This modifies the model in place."
-    )
-    for name, child in module.named_children():
-        if isinstance(child, TorchTuneKVCache):
-            cache_shape = child.k_cache.shape
-            if is_transposed:
-                max_batch_size, n_heads, max_seq_length, head_dim = cache_shape
-            else:
-                max_batch_size, max_seq_length, n_heads, head_dim = cache_shape
-            setattr(
-                module,
-                name,
-                QuantizedKVCache.from_torchtune_float(
-                    child,
-                    QuantizedCacheType.AffineAsymmetric,
-                    is_transposed,
-                    enable_dynamic_shape,
-                ),
-            )
-        else:
-            replace_kv_cache_with_quantized_kv_cache(child)
-    return module
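The surviving replace_kv_cache_with_quantized_kv_cache follows the usual recursive named_children/setattr module-swap pattern. A rough, self-contained sketch of that pattern, using made-up OldBlock/NewBlock classes rather than the real KVCache/QuantizedKVCache types:

import torch.nn as nn


class OldBlock(nn.Module):
    def forward(self, x):
        return x


class NewBlock(nn.Module):
    @classmethod
    def from_float(cls, old: OldBlock) -> "NewBlock":
        # A real implementation would copy shapes/parameters from `old`.
        return cls()

    def forward(self, x):
        return x


def replace_old_with_new(module: nn.Module) -> nn.Module:
    # Walk direct children; swap matches in place, recurse into everything else.
    for name, child in module.named_children():
        if isinstance(child, OldBlock):
            setattr(module, name, NewBlock.from_float(child))
        else:
            replace_old_with_new(child)
    return module


model = nn.Sequential(OldBlock(), nn.Sequential(OldBlock()))
replace_old_with_new(model)
assert all(not isinstance(m, OldBlock) for m in model.modules())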

examples/models/llama/source_transformation/sdpa.py

Lines changed: 1 addition & 60 deletions

@@ -80,7 +80,7 @@ def forward(
             input_pos[0].item(),
             seqlen,
             None,  # Attention mask
-            0,  # Dropout probability, ignored by the code
+            0,  # dropout probability. Ignored by the code
             True,  # is_causal
         )
         return output.view(bsz, seqlen, self.dim).to(dtype=input_dtype)
@@ -105,65 +105,6 @@ def replace_sdpa_with_custom_op(module: torch.nn.Module) -> torch.nn.Module:
     return module
 
 
-class SDPAOnlyCustom(torch.nn.Module):
-    """
-    Just the custom SDPA op, no KV cache update included. Can only be used
-    in conjunction with a quantized KV cache.
-    """
-
-    def __init__(
-        self,
-    ):
-        super().__init__()
-
-    def forward(
-        self,
-        input_pos: torch.Tensor,
-        q: torch.Tensor,
-        k: torch.Tensor,
-        v: torch.Tensor,
-        bsz: int,
-        seqlen: int,
-        mask: torch.Tensor = None,
-    ):
-        # Custom op only supports float32 currently. Converting to/from float32 is
-        # faster than not having the op.
-        input_dtype = q.dtype
-        q = q.to(dtype=torch.float)
-        k = k.to(dtype=torch.float)
-        v = v.to(dtype=torch.float)
-        output = torch.ops.llama.custom_sdpa(
-            q,
-            k,
-            v,
-            input_pos[0].item(),
-            None,  # Attention mask
-            0,  # Dropout probability, ignored by the code.
-            True,  # is_causal
-        )
-        return output.view(bsz, seqlen, -1).to(dtype=input_dtype)
-
-
-def _replace_sdpa_with_sdpa_only_custom_op(module: torch.nn.Module):
-    for name, child in module.named_children():
-        if isinstance(child, SDPA):
-            assert (
-                child.kv_cache.cache_fp_type == torch.float32
-            ), "Only float32 is supported for custom SDPA"
-            setattr(
-                module,
-                name,
-                SDPAOnlyCustom(),
-            )
-        else:
-            _replace_sdpa_with_sdpa_only_custom_op(child)
-
-
-def replace_sdpa_with_sdpa_only_custom_op(module: torch.nn.Module) -> torch.nn.Module:
-    _replace_sdpa_with_sdpa_only_custom_op(module)
-    return module
-
-
 class SDPASimple(torch.nn.Module):
 
     def __init__(
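The dtype handling in the forward methods above follows a common wrapper pattern: cast inputs to float32 for a kernel that only supports float32, then cast the result back to the caller's dtype. A rough, standalone illustration of just that pattern, using plain torch.nn.functional.scaled_dot_product_attention as a stand-in for the custom op (an assumption for demonstration only):

import torch
import torch.nn.functional as F


def sdpa_float32_roundtrip(
    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor
) -> torch.Tensor:
    # Remember the caller's dtype, run the float32-only kernel, then cast back.
    input_dtype = q.dtype
    q32, k32, v32 = (t.to(dtype=torch.float32) for t in (q, k, v))
    out = F.scaled_dot_product_attention(q32, k32, v32, is_causal=True)
    return out.to(dtype=input_dtype)


q = k = v = torch.randn(1, 4, 16, 8, dtype=torch.float16)
out = sdpa_float32_roundtrip(q, k, v)
assert out.dtype == torch.float16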
