
Commit a556a2d

Lunwen He authored and facebook-github-bot committed
Support SpinQuant to run on ET (#5435)
Summary:
Pull Request resolved: #5435

This PR adds the option to run SpinQuant on ET.

Reviewed By: mergennachin

Differential Revision: D62526665

fbshipit-source-id: ff18110656d5ad90eb79020a1c2f6d235a9001b3
1 parent: 28c9a1d

File tree

2 files changed: +39 -3 lines


examples/models/llama2/export_llama_lib.py

Lines changed: 5 additions & 1 deletion
@@ -790,7 +790,11 @@ def _get_source_transforms( # noqa
 
             transforms.append(inject_fast_hadamard_transform_cuda_for_spin_quant)
         elif args.use_spin_quant == "native":
-            raise NotImplementedError("native SpinQuant is not implemented yet.")
+            from .source_transformation.spin_quant import (
+                inject_fast_hadamard_transform_native_for_spin_quant,
+            )
+
+            transforms.append(inject_fast_hadamard_transform_native_for_spin_quant)
 
     if args.embedding_quantize:
         modelname = f"{modelname}_e"
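Each source transform collected here is a callable that takes the model's torch.nn.Module and returns the (possibly modified) module, so the "native" branch simply appends one more such callable to the list. Below is a minimal sketch of how a transform list like this could be applied in order; the apply_source_transforms helper is hypothetical and not part of this commit.

from typing import Callable, List

import torch


def apply_source_transforms(
    model: torch.nn.Module,
    transforms: List[Callable[[torch.nn.Module], torch.nn.Module]],
) -> torch.nn.Module:
    # Apply each transform in order; every transform returns the rewritten module.
    for transform in transforms:
        model = transform(model)
    return model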

examples/models/llama2/source_transformation/spin_quant.py

Lines changed: 34 additions & 2 deletions
@@ -33,7 +33,7 @@ def _inject_fast_hadamard_transform_cuda_for_spin_quant(module: torch.nn.Module)
             "Please install fast-hadamard-transform: pip install fast-hadamard-transform"
         )
 
-    class FeedForwardCustom(nn.Module):
+    class FeedForwardCudaCustom(nn.Module):
         def __init__(self, w1, w2, w3):
             super().__init__()
             self.w1 = w1
@@ -47,7 +47,7 @@ def forward(self, x):
 
     for name, child in module.named_children():
         if isinstance(child, FeedForward):
-            setattr(module, name, FeedForwardCustom(child.w1, child.w2, child.w3))
+            setattr(module, name, FeedForwardCudaCustom(child.w1, child.w2, child.w3))
         else:
             _inject_fast_hadamard_transform_cuda_for_spin_quant(child)
 
@@ -59,6 +59,38 @@ def inject_fast_hadamard_transform_cuda_for_spin_quant(
     return module
 
 
+def _inject_fast_hadamard_transform_native_for_spin_quant(module: torch.nn.Module):
+    """
+    SpinQuant needs two Hadamard matrices: R3 and R4. Here we are only injecting R4 in the feed forward layer.
+    R3 needs to be injected as well when KV cache quantization is enabled.
+    """
+
+    class FeedForwardNativeCustom(nn.Module):
+        def __init__(self, w1, w2, w3):
+            super().__init__()
+            self.w1 = w1
+            self.w2 = w2
+            self.w3 = w3
+
+        def forward(self, x):
+            return self.w2(
+                torch.ops.llama.fast_hadamard_transform(F.silu(self.w1(x)) * self.w3(x))
+            )
+
+    for name, child in module.named_children():
+        if isinstance(child, FeedForward):
+            setattr(module, name, FeedForwardNativeCustom(child.w1, child.w2, child.w3))
+        else:
+            _inject_fast_hadamard_transform_native_for_spin_quant(child)
+
+
+def inject_fast_hadamard_transform_native_for_spin_quant(
+    module: torch.nn.Module,
+) -> torch.nn.Module:
+    _inject_fast_hadamard_transform_native_for_spin_quant(module)
+    return module
+
+
 def _replace_linear_with_linear_8da4w_for_spin_quant(
     module: torch.nn.Module,
     checkpoint: Any,
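The torch.ops.llama.fast_hadamard_transform call in the new forward is a custom op; the sketch below only illustrates the R4 rotation it is assumed to compute, i.e. a normalized Walsh-Hadamard transform over the hidden dimension of the feed-forward block. The names fast_hadamard_transform_reference and FeedForwardWithR4 are hypothetical and not part of this commit.

import torch
import torch.nn as nn
import torch.nn.functional as F


def fast_hadamard_transform_reference(x: torch.Tensor) -> torch.Tensor:
    # Normalized Walsh-Hadamard transform over the last dimension,
    # which must be a power of two.
    dim = x.shape[-1]
    assert (dim & (dim - 1)) == 0, "last dimension must be a power of two"
    out = x
    h = 1
    while h < dim:
        # Butterfly step: combine pairs of blocks of size h.
        out = out.reshape(*out.shape[:-1], dim // (2 * h), 2, h)
        a, b = out[..., 0, :], out[..., 1, :]
        out = torch.stack((a + b, a - b), dim=-2)
        out = out.reshape(*out.shape[:-3], dim)
        h *= 2
    return out / dim**0.5


class FeedForwardWithR4(nn.Module):
    # Toy SwiGLU feed-forward block with the R4 Hadamard rotation applied to the
    # hidden activations right before the down-projection (w2), mirroring the
    # structure of FeedForwardNativeCustom above.
    def __init__(self, dim: int, hidden_dim: int):
        super().__init__()
        self.w1 = nn.Linear(dim, hidden_dim, bias=False)
        self.w2 = nn.Linear(hidden_dim, dim, bias=False)
        self.w3 = nn.Linear(dim, hidden_dim, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.silu(self.w1(x)) * self.w3(x)
        return self.w2(fast_hadamard_transform_reference(hidden))


if __name__ == "__main__":
    block = FeedForwardWithR4(dim=64, hidden_dim=128)  # hidden_dim is a power of two
    print(block(torch.randn(2, 8, 64)).shape)  # torch.Size([2, 8, 64])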
