Avoid converting k and v to q dtype #2201

Closed · wants to merge 1 commit
9 changes: 9 additions & 0 deletions examples/models/llama2/builder.py
@@ -176,6 +176,15 @@ def to_dtype(self, dtype_override: Optional[DType]) -> "LlamaEdgeManager":
logging.info(f"model.to {torch_dtype}")
self.model = self.model.to(dtype=torch_dtype)
self.dtype = dtype_override

# Convert the KV cache example inputs to the target dtype as well. This can be
# removed once mutable buffers are supported. Assumes the KV cache tensors are
# the last two entries in the example inputs.
if self.use_kv_cache:
dtype = torch.float16 if self.dtype == DType.fp16 else torch.float32
example_inputs = list(self.example_inputs[:-2]) + [
cache.to(dtype) for cache in self.example_inputs[-2:]
]
self.example_inputs = tuple(example_inputs)
return self

def source_transform(
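For reference, a minimal standalone sketch of the recasting done above, using made-up shapes for the token input and the two cache tensors (the real shapes come from the model's cache sizes):

```python
import torch

# Hypothetical example inputs: (tokens, cache_k, cache_v); shapes are illustrative only.
tokens = torch.tensor([[1]], dtype=torch.long)
cache_k = torch.zeros(1, 8, 16, 64)
cache_v = torch.zeros(1, 8, 16, 64)
example_inputs = (tokens, cache_k, cache_v)

# Cast only the last two tensors (the KV caches) to the target dtype;
# integer inputs such as token ids keep their original dtype.
target_dtype = torch.float16
example_inputs = tuple(
    list(example_inputs[:-2]) + [cache.to(target_dtype) for cache in example_inputs[-2:]]
)

assert example_inputs[0].dtype == torch.long
assert example_inputs[-1].dtype == torch.float16
```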
8 changes: 2 additions & 6 deletions examples/models/llama2/model.py
@@ -296,10 +296,6 @@ def forward(
# tensor will be 2-dimensional, regardless of the values of l & s
mask = torch.squeeze(mask, [0, 1])

# FIXME: This should be so automatically! MKG
keys = keys.to(dtype=xq.dtype)
values = values.to(dtype=xq.dtype)

output = F.scaled_dot_product_attention(
xq, keys, values, attn_mask=mask, dropout_p=0.0
)
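Context for the removal above: torch.nn.functional.scaled_dot_product_attention expects query, key, and value to share a dtype, which is why the explicit casts were needed while the caches were always created in float32. A small sketch (with made-up batch/head/sequence dimensions) of the call once all three tensors already carry the model dtype:

```python
import torch
import torch.nn.functional as F

dtype = torch.float32  # stands in for the model dtype (e.g. fp16 on device)
xq = torch.randn(1, 8, 1, 64, dtype=dtype)      # (bsz, n_heads, seqlen, head_dim)
keys = torch.randn(1, 8, 16, 64, dtype=dtype)   # KV cache already in the model dtype
values = torch.randn(1, 8, 16, 64, dtype=dtype)
mask = torch.zeros(1, 16, dtype=dtype)          # additive attention mask

# No keys/values .to(xq.dtype) casts are needed when the cache dtype matches xq.
output = F.scaled_dot_product_attention(xq, keys, values, attn_mask=mask, dropout_p=0.0)
print(output.shape)  # torch.Size([1, 8, 1, 64])
```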
@@ -672,8 +668,8 @@ def get_example_inputs(self):

def get_example_inputs_kvcache(self):
cache_sizes = self.model_.get_cache_sizes()
cache_k = torch.zeros(cache_sizes)
cache_v = torch.zeros(cache_sizes)
cache_k = torch.zeros(cache_sizes, dtype=self.dtype)
cache_v = torch.zeros(cache_sizes, dtype=self.dtype)
return (
torch.tensor(
[[1]], dtype=torch.long
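A minimal sketch of what the change above produces, assuming a hypothetical cache size and an fp16 model dtype: token ids stay integer while both caches are zero-initialized directly in the model dtype instead of defaulting to float32 and being cast later by the attention code.

```python
import torch

# Hypothetical values standing in for self.model_.get_cache_sizes() and self.dtype.
cache_sizes = (1, 8, 16, 64)
model_dtype = torch.float16

cache_k = torch.zeros(cache_sizes, dtype=model_dtype)
cache_v = torch.zeros(cache_sizes, dtype=model_dtype)

tokens = torch.tensor([[1]], dtype=torch.long)  # token ids stay integer
print(cache_k.dtype, cache_v.dtype, tokens.dtype)  # torch.float16 torch.float16 torch.int64
```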