Commit 59ba811

[ExecuTorch] Allow using custom SDPA for non-float32 dtypes in llama demo
Converting the input to and from float32 is faster than not using the op. h/t to torchchat, which does this already (though it had a bug, which I sent a patch for).

Differential Revision: [D63158951](https://our.internmc.facebook.com/intern/diff/D63158951/)

ghstack-source-id: 244181863
Pull Request resolved: #5548
1 parent b2517d6 · commit 59ba811
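
The change itself is a small dtype round trip. Below is a minimal, self-contained sketch of the pattern the message describes; run_sdpa_in_float32 is a hypothetical helper name, and torch.nn.functional.scaled_dot_product_attention stands in for torch.ops.llama.sdpa_with_kv_cache, which only exists in an ExecuTorch build with the custom op registered.

import torch
import torch.nn.functional as F


def run_sdpa_in_float32(q, k, v, is_causal=True):
    # Remember the caller's dtype, cast to float32 for the float32-only kernel,
    # then cast the result back so downstream modules see the original dtype.
    input_dtype = q.dtype
    q = q.to(dtype=torch.float)
    k = k.to(dtype=torch.float)
    v = v.to(dtype=torch.float)
    out = F.scaled_dot_product_attention(q, k, v, is_causal=is_causal)
    return out.to(dtype=input_dtype)


q = k = v = torch.randn(1, 8, 16, 64, dtype=torch.float16)
assert run_sdpa_in_float32(q, k, v).dtype == torch.float16

Whether the cast round trip actually beats skipping the custom op depends on the backend; the commit message reports that it does for this demo.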

File tree

1 file changed: +10 -2 lines

  • examples/models/llama2/source_transformation/sdpa.py


examples/models/llama2/source_transformation/sdpa.py

Lines changed: 10 additions & 2 deletions
@@ -23,7 +23,9 @@ def __init__(
         dim: int,
     ):
         super().__init__()
-        self.kv_cache = kv_cache
+        # Custom op only supports float32 currently. Converting to/from float32 is
+        # faster than not having the op.
+        self.kv_cache = kv_cache.to(torch.float)
         self.dim = dim
 
     def forward(
@@ -36,6 +38,12 @@ def forward(
         seqlen,
         mask,
     ):
+        # Custom op only supports float32 currently. Converting to/from float32 is
+        # faster than not having the op.
+        input_dtype = q.dtype
+        q = q.to(dtype=torch.float)
+        k = k.to(dtype=torch.float)
+        v = v.to(dtype=torch.float)
         output = torch.ops.llama.sdpa_with_kv_cache(
             q,
             k,
@@ -48,7 +56,7 @@ def forward(
             0, # dropout probability. Ignored by the code
             True, # is_causal
         )
-        return output.view(bsz, seqlen, self.dim)
+        return output.view(bsz, seqlen, self.dim).to(dtype=input_dtype)
 
 
 def _replace_sdpa_with_custom_op(module: torch.nn.Module):
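
The diff stops at the signature of _replace_sdpa_with_custom_op, whose body is unchanged by this commit. For context, a source transformation like this typically walks the module tree and swaps the eager SDPA module for the custom-op wrapper. The sketch below only illustrates that general pattern with stand-in classes (SDPA, SDPACustom, replace_sdpa_sketch are simplified stand-ins, not the actual ExecuTorch implementation).

import torch
from torch import nn


class SDPA(nn.Module):
    # Stand-in for the eager SDPA module defined earlier in sdpa.py; the real
    # class holds more state and implements reference attention in forward().
    def __init__(self, kv_cache: torch.Tensor, dim: int):
        super().__init__()
        self.kv_cache = kv_cache
        self.dim = dim


class SDPACustom(nn.Module):
    # Stand-in mirroring the diff above: the custom op only supports float32,
    # so the cache is stored as float32 up front.
    def __init__(self, kv_cache: torch.Tensor, dim: int):
        super().__init__()
        self.kv_cache = kv_cache.to(torch.float)
        self.dim = dim


def replace_sdpa_sketch(module: nn.Module) -> nn.Module:
    # Recursively swap eager SDPA instances for the custom-op wrapper.
    for name, child in module.named_children():
        if isinstance(child, SDPA):
            setattr(module, name, SDPACustom(child.kv_cache, child.dim))
        else:
            replace_sdpa_sketch(child)
    return module


class TinyBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.attn = SDPA(torch.zeros(1, 8, 128, 64, dtype=torch.float16), dim=512)


block = replace_sdpa_sketch(TinyBlock())
assert isinstance(block.attn, SDPACustom)
assert block.attn.kv_cache.dtype == torch.float32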
