
Commit e1c0815

test sdpa with fp16 (#553)
* test sdpa with fp16
* kv cache fp32
* typo
1 parent 33dc210 commit e1c0815
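
The change keeps the KV cache in fp32 while the rest of the model may run in fp16: q/k/v are cast up to float before the custom sdpa op, and the attention output is cast back to the query dtype afterwards. A minimal, self-contained sketch of that casting pattern, using torch.nn.functional.scaled_dot_product_attention as a stand-in for torch.ops.llama.sdpa_with_kv_cache (the stand-in op and the shapes are illustrative, not taken from the file):

import torch
import torch.nn.functional as F

# Illustrative shapes, not taken from export_et_util.py.
bsz, n_heads, seqlen, head_dim = 1, 8, 16, 64

# Activations in fp16, as in the "test sdpa with fp16" scenario.
q = torch.randn(bsz, n_heads, seqlen, head_dim, dtype=torch.float16)
k = torch.randn(bsz, n_heads, seqlen, head_dim, dtype=torch.float16)
v = torch.randn(bsz, n_heads, seqlen, head_dim, dtype=torch.float16)

# KV cache buffers stay fp32 ("kv cache fp32"), mirroring the
# `dtype = torch.float` override forced in CustomKVCache.__init__.
k_cache = torch.zeros(bsz, n_heads, seqlen, head_dim, dtype=torch.float)
v_cache = torch.zeros(bsz, n_heads, seqlen, head_dim, dtype=torch.float)
k_cache.copy_(k.float())
v_cache.copy_(v.float())

# Cast q up before attention (the patch passes q.float(), k.float(), v.float()
# into the custom op), then cast the result back to the activation dtype,
# mirroring `output.view(...).to(dtype=q.dtype)`.
output = F.scaled_dot_product_attention(q.float(), k_cache, v_cache)
output = output.to(dtype=q.dtype)
assert output.dtype == torch.float16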

File tree

1 file changed: +8 -6 lines changed


export_et_util.py

Lines changed: 8 additions & 6 deletions
@@ -9,6 +9,8 @@ class CustomKVCache(nn.Module):
     def __init__(self, max_batch_size, max_seq_length, n_heads, head_dim, dtype):
         super().__init__()
 
+        dtype = torch.float
+
         # This is flipped around from what is in build.model's KVCache
         cache_shape = (max_batch_size, max_seq_length, n_heads, head_dim)
         self.register_buffer(
@@ -21,8 +23,8 @@ def __init__(self, max_batch_size, max_seq_length, n_heads, head_dim, dtype):
     def update(self, input_pos, k_val, v_val):
         k_out = self.k_cache
         v_out = self.v_cache
-        k_out[:, :, input_pos] = k_val
-        v_out[:, :, input_pos] = v_val
+        k_out[:, :, input_pos] = k_val.float()
+        v_out[:, :, input_pos] = v_val.float()
 
         return k_out, v_out
 
@@ -67,15 +69,15 @@ def forward(self, x, freqs_cis, mask, input_pos=None):
         # KV cache should always be enabled
         assert self.kv_cache is not None
         output = torch.ops.llama.sdpa_with_kv_cache(
-            q,
-            k,
-            v,
+            q.float(),
+            k.float(),
+            v.float(),
             self.kv_cache.k_cache,
             self.kv_cache.v_cache,
             input_pos[-1].item(),
             seqlen,
         )
-        output = output.view(bsz, seqlen, self.dim)
+        output = output.view(bsz, seqlen, self.dim).to(dtype=q.dtype)
         return self.wo(output)
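
On the cache side, update() casts fp16 k/v values up before writing them into the fp32 buffers. A reduced sketch of that behavior, using a hypothetical stand-in class rather than the full CustomKVCache (the buffer registration, cache shape, and indexing here are assumptions chosen for the example):

import torch
import torch.nn as nn

class FP32Cache(nn.Module):
    # Hypothetical stand-in for CustomKVCache, reduced to the dtype handling:
    # whatever dtype the caller passes, the buffers are created as fp32.
    def __init__(self, cache_shape, dtype):
        super().__init__()
        dtype = torch.float  # override, as the patch does in __init__
        self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=dtype))
        self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=dtype))

    def update(self, input_pos, k_val, v_val):
        # fp16 inputs are cast up so the writes match the fp32 buffers,
        # mirroring `k_val.float()` / `v_val.float()` in the patch.
        self.k_cache[:, input_pos] = k_val.float()
        self.v_cache[:, input_pos] = v_val.float()
        return self.k_cache, self.v_cache

# The cache stays fp32 even when built for, and updated with, fp16 tensors.
cache = FP32Cache((1, 16, 8, 64), dtype=torch.float16)
k = torch.randn(1, 1, 8, 64, dtype=torch.float16)
v = torch.randn(1, 1, 8, 64, dtype=torch.float16)
k_out, v_out = cache.update(torch.tensor([0]), k, v)
assert k_out.dtype == torch.float32 and v_out.dtype == torch.float32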
