
Commit 0af8673

Lunwen He authored and facebook-github-bot committed
spinquant in eager mode (#5125)
Summary: Pull Request resolved: #5125. This PR adds the option to export the model with SpinQuant on GPU. Reviewed By: mergennachin. Differential Revision: D62042861
1 parent b52d4b6 commit 0af8673

File tree: 3 files changed, +78 -0 lines changed

examples/models/llama2/TARGETS

Lines changed: 2 additions & 0 deletions
@@ -73,6 +73,7 @@ runtime.python_library(
         "source_transformation/quantize.py",
         "source_transformation/rope.py",
         "source_transformation/sdpa.py",
+        "source_transformation/spin_quant.py",
     ],
     _is_external_target = True,
     base_module = "executorch.examples.models.llama2",
@@ -83,6 +84,7 @@ runtime.python_library(
         "@EXECUTORCH_CLIENTS",
     ],
     deps = [
+        "//ai_codesign/gen_ai/fast_hadamard_transform:fast_hadamard_transform",
         "//caffe2:torch",
         "//executorch/examples/models:model_base",
         "//executorch/examples/models:models",

examples/models/llama2/export_llama_lib.py

Lines changed: 21 additions & 0 deletions
@@ -315,6 +315,15 @@ def build_args_parser() -> argparse.ArgumentParser:
         default=False,
         help="Generate logits for all inputs.",
     )
+
+    parser.add_argument(
+        "-sq",
+        "--use_spin_quant",
+        type=str,
+        default=None,
+        choices=["cuda", "native"],
+        help="Use SpinQuant for better quantization performance. Only 'cuda' and 'native' are supported.",
+    )
     return parser

@@ -415,6 +424,18 @@ def _prepare_for_llama_export(modelname: str, args) -> LLMEdgeManager:
     # to get free perf gain.
     transforms.append(replace_sdpa_with_simple_sdpa)
     transforms.append(replace_causal_mask)
+
+    if args.use_spin_quant:
+        if args.use_spin_quant == "cuda":
+            from .source_transformation.spin_quant import (
+                inject_fast_hadamard_transform_cuda_for_spin_quant,
+            )
+
+            transforms.append(inject_fast_hadamard_transform_cuda_for_spin_quant)
+
+        elif args.use_spin_quant == "native":
+            raise NotImplementedError("native SpinQuant is not implemented yet.")
+
     return (
         _load_llama_model(
             modelname=modelname,
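For illustration, a minimal, self-contained sketch of how the new flag drives the transform pipeline at export time. The stand-in model and hard-coded argv are hypothetical; the flag, its choices, and the function name come from the diff above, and actually applying the injected transform requires fast-hadamard-transform to be installed:

import argparse

import torch

from executorch.examples.models.llama2.source_transformation.spin_quant import (
    inject_fast_hadamard_transform_cuda_for_spin_quant,
)

parser = argparse.ArgumentParser()
parser.add_argument(
    "-sq", "--use_spin_quant", type=str, default=None, choices=["cuda", "native"]
)
args = parser.parse_args(["--use_spin_quant", "cuda"])  # hypothetical argv

transforms = []
if args.use_spin_quant == "cuda":
    # Recursively swaps every FeedForward for a Hadamard-rotated variant (R4).
    transforms.append(inject_fast_hadamard_transform_cuda_for_spin_quant)
elif args.use_spin_quant == "native":
    raise NotImplementedError("native SpinQuant is not implemented yet.")

model = torch.nn.Sequential(torch.nn.Linear(16, 16))  # stand-in for the Llama model
for transform in transforms:
    model = transform(model)  # no FeedForward modules here, so this is a no-op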
examples/models/llama2/source_transformation/spin_quant.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+# Helper functions for transforming the model to be able to run SpinQuant.
+# See https://github.com/facebookresearch/SpinQuant for more details about SpinQuant.
+
+import torch
+
+import torch.nn.functional as F
+
+from executorch.examples.models.llama2.llama_transformer import FeedForward
+from torch import nn
+
+
+def _inject_fast_hadamard_transform_cuda_for_spin_quant(module: torch.nn.Module):
+    """
+    SpinQuant needs two Hadamard matrices: R3 and R4. Here we are only injecting R4 in the feed-forward layer.
+    R3 needs to be injected as well when KV cache quantization is enabled.
+    """
+    try:
+        from fast_hadamard_transform import hadamard_transform
+    except ImportError:
+        raise ImportError(
+            "Please install fast-hadamard-transform: pip install fast-hadamard-transform"
+        )
+
+    class FeedForwardCustom(nn.Module):
+        def __init__(self, w1, w2, w3):
+            super().__init__()
+            self.w1 = w1
+            self.w2 = w2
+            self.w3 = w3
+
+        def forward(self, x):
+            w = F.silu(self.w1(x)) * self.w3(x)
+            n = w.shape[-1]
+            return self.w2(hadamard_transform(w.contiguous()) / torch.tensor(n).sqrt())
+
+    for name, child in module.named_children():
+        if isinstance(child, FeedForward):
+            setattr(module, name, FeedForwardCustom(child.w1, child.w2, child.w3))
+        else:
+            _inject_fast_hadamard_transform_cuda_for_spin_quant(child)
+
+
+def inject_fast_hadamard_transform_cuda_for_spin_quant(
+    module: torch.nn.Module,
+) -> torch.nn.Module:
+    _inject_fast_hadamard_transform_cuda_for_spin_quant(module)
+    return module
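A note on the division by sqrt(n) in FeedForwardCustom.forward: an n-by-n Hadamard matrix H satisfies H^T H = n*I, so hadamard_transform(w) / sqrt(n) applies an orthonormal rotation (the R4 rotation of SpinQuant), and the matching inverse rotation can be folded into w2's checkpoint weights offline without changing the network's output. A minimal sketch checking that property, assuming fast-hadamard-transform is installed and a CUDA device is available (the kernel is GPU-only):

import torch

from fast_hadamard_transform import hadamard_transform

n = 4096  # a power-of-two FFN hidden dim, as the kernel expects
x = torch.randn(8, n, device="cuda", dtype=torch.float16)

# Scaled Hadamard transform: an orthonormal rotation of the last dimension.
y = hadamard_transform(x.contiguous()) / torch.tensor(float(n)).sqrt()

# A rotation preserves L2 norms (up to fp16 rounding).
assert torch.allclose(x.float().norm(dim=-1), y.float().norm(dim=-1), rtol=1e-2)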
