
Commit 1c0c17c

Update on "Transform model to be able to use Attention Sink"
This PR adds the necessary functions for transforming the model to be able to use Attention Sink.

Differential Revision: [D65571289](https://our.internmc.facebook.com/intern/diff/D65571289/)

[ghstack-poisoned]
1 parent 8a46c77 commit 1c0c17c

File tree: 2 files changed (+8, -8 lines)


examples/models/llama/export_llama_lib.py

Lines changed: 2 additions & 2 deletions
@@ -434,9 +434,9 @@ def build_args_parser() -> argparse.ArgumentParser:
 
     parser.add_argument(
         "--use_attention_sink",
-        default="4,2044,1024",
+        default=None,
         type=str,
-        help="Use attention sink to have fluent multi-round conversation. '<sink_size>,<window_size>,<batch_eviction_size>'"
+        help="Use attention sink to have fluent multi-round conversation. '<sink_size>,<window_size>,<batch_eviction_size>', e.g., '4,2044,1024'.",
     )
 
     parser.add_argument(
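
With this change the flag defaults to None, so attention sink stays disabled unless a value string is passed explicitly. A minimal sketch of how the documented '<sink_size>,<window_size>,<batch_eviction_size>' format decomposes into three integers (the parsing mirrors the split(",") used in model.py below; the variable names are illustrative):

# Illustrative parsing of the --use_attention_sink value (example taken from the help text).
value = "4,2044,1024"  # '<sink_size>,<window_size>,<batch_eviction_size>'
sink_size, window_size, eviction_batch_size = (int(x) for x in value.split(","))
assert (sink_size, window_size, eviction_batch_size) == (4, 2044, 1024)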

examples/models/llama/model.py

Lines changed: 6 additions & 6 deletions
@@ -200,11 +200,10 @@ def __init__(self, **kwargs):
             )
 
             sanitize_checkpoint_from_pre_quantization(checkpoint)
-
-        if hasattr(self.args, "use_attention_sink"):
-            from .source_transformation.sink_attention import (
-                enable_attention_sink,
-            )
+
+        if hasattr(self.args, "use_attention_sink") and self.args.use_attention_sink:
+            from .source_transformation.attention_sink import enable_attention_sink
+
             attention_sink_params = self.args.use_attention_sink.split(",")
             assert len(attention_sink_params) == 3
 
@@ -213,7 +212,8 @@ def __init__(self, **kwargs):
                 params=model_args,
                 sink_size=int(attention_sink_params[0]),
                 window_size=int(attention_sink_params[1]),
-                eviction_batch_size=int(attention_sink_params[2]))
+                eviction_batch_size=int(attention_sink_params[2]),
+            )
 
         # assign=True: load params/buffers by assignment instead of performing an in-place copy.
         # Because we are using device="meta", tensors do not have memory associated with them
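
Taken together, the two hunks make the rewrite opt-in: the guard now also requires the flag to carry a value, the import points at the attention_sink module, and the call gains a trailing comma. A minimal sketch of the resulting flow inside __init__, where the positional target passed to enable_attention_sink and the self.model_ assignment are assumptions not visible in these hunks:

# Minimal sketch, assuming `self.model_` is the eager Transformer being transformed
# and `model_args` its ModelArgs; only the keyword arguments below appear in the diff.
if hasattr(self.args, "use_attention_sink") and self.args.use_attention_sink:
    from .source_transformation.attention_sink import enable_attention_sink

    attention_sink_params = self.args.use_attention_sink.split(",")
    assert len(attention_sink_params) == 3
    self.model_ = enable_attention_sink(
        self.model_,  # assumed positional target; not shown in the hunks above
        params=model_args,
        sink_size=int(attention_sink_params[0]),
        window_size=int(attention_sink_params[1]),
        eviction_batch_size=int(attention_sink_params[2]),
    )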

0 commit comments
