Avoid converting k and v to q dtype (#2201)

larryliu0820 · facebook-github-bot · commit 5a18cc6e1201 · 2024-03-01T16:24:16.000-08:00
Summary: Pull Request resolved: #2201. We don't want to hard-code the conversion of k and v to q's dtype; instead, we should make sure they always have the same dtype before being fed into sdpa. The problem was a dtype mismatch between the kv cache used for tracing and the actual weights. This happens in the fp16 flow: after we convert the whole model to fp16, we still use fp32 kv cache tensors for tracing, which causes the dtype mismatch. This diff changes the logic so that the kv cache used during tracing has the same dtype as the weights. Reviewed By: mikekgfb Differential Revision: D54426672 fbshipit-source-id: d34009fedc59ebf5ba7ee77e26341a1f99340df6
diff --git a/examples/models/llama2/builder.py b/examples/models/llama2/builder.py
@@ -176,6 +176,15 @@ def to_dtype(self, dtype_override: Optional[DType]) -> "LlamaEdgeManager":
             logging.info(f"model.to {torch_dtype}")
             self.model = self.model.to(dtype=torch_dtype)
             self.dtype = dtype_override
+
+        # convert kv cache to dtype as well. This should be removed after mutable buffer is supported.
+        # assuming the kv cache are the last 2 tensors in the example inputs
+        if self.use_kv_cache:
+            dtype = torch.float16 if self.dtype == DType.fp16 else torch.float32
+            example_inputs = list(self.example_inputs[:-2]) + [
+                cache.to(dtype) for cache in self.example_inputs[-2:]
+            ]
+            self.example_inputs = tuple(example_inputs)
         return self
 
     def source_transform(
diff --git a/examples/models/llama2/model.py b/examples/models/llama2/model.py
@@ -296,10 +296,6 @@ def forward(
         # tensor will be 2-dimensional, regarldess of the values of l & s
         mask = torch.squeeze(mask, [0, 1])
 
-        # FIXME: This should be so automatically! MKG
-        keys = keys.to(dtype=xq.dtype)
-        values = values.to(dtype=xq.dtype)
-
         output = F.scaled_dot_product_attention(
             xq, keys, values, attn_mask=mask, dropout_p=0.0
         )
@@ -672,8 +668,8 @@ def get_example_inputs(self):
 
     def get_example_inputs_kvcache(self):
         cache_sizes = self.model_.get_cache_sizes()
-        cache_k = torch.zeros(cache_sizes)
-        cache_v = torch.zeros(cache_sizes)
+        cache_k = torch.zeros(cache_sizes, dtype=self.dtype)
+        cache_v = torch.zeros(cache_sizes, dtype=self.dtype)
         return (
             torch.tensor(
                 [[1]], dtype=torch.long