Commit cab6335

swolchok authored and facebook-github-bot committed
Allow using custom SDPA for non-float32 dtypes in llama demo (#5548)
Summary: Pull Request resolved: #5548

Converting the input to and from float32 is faster than not using the op. h/t to torchchat, which does this already (though it had a bug, which I sent a patch for).

Reviewed By: kimishpatel
Differential Revision: D63158951
fbshipit-source-id: 58c90d141ee403536c03a3b731f8547790fc9440
1 parent f68a138 commit cab6335

File tree

1 file changed: +10 −2 lines changed
  • examples/models/llama2/source_transformation


examples/models/llama2/source_transformation/sdpa.py

Lines changed: 10 additions & 2 deletions
@@ -23,7 +23,9 @@ def __init__(
         dim: int,
     ):
         super().__init__()
-        self.kv_cache = kv_cache
+        # Custom op only supports float32 currently. Converting to/from float32 is
+        # faster than not having the op.
+        self.kv_cache = kv_cache.to(torch.float)
         self.dim = dim
 
     def forward(
@@ -36,6 +38,12 @@ def forward(
         seqlen,
         mask,
     ):
+        # Custom op only supports float32 currently. Converting to/from float32 is
+        # faster than not having the op.
+        input_dtype = q.dtype
+        q = q.to(dtype=torch.float)
+        k = k.to(dtype=torch.float)
+        v = v.to(dtype=torch.float)
         output = torch.ops.llama.sdpa_with_kv_cache(
             q,
             k,
@@ -48,7 +56,7 @@ def forward(
             0,  # dropout probability. Ignored by the code
             True,  # is_causal
         )
-        return output.view(bsz, seqlen, self.dim)
+        return output.view(bsz, seqlen, self.dim).to(dtype=input_dtype)
 
 
 def _replace_sdpa_with_custom_op(module: torch.nn.Module):
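As a hedged illustration of the pattern this commit applies (a minimal sketch, not the actual ExecuTorch module), the code below runs a float32-only attention kernel on half-precision inputs by converting to float32 and converting the result back. Here torch.nn.functional.scaled_dot_product_attention stands in for the custom llama.sdpa_with_kv_cache op, and the function name sdpa_float32_roundtrip is an invented placeholder.

import torch
import torch.nn.functional as F


def sdpa_float32_roundtrip(q, k, v):
    # Remember the caller's dtype (e.g. torch.bfloat16), run the kernel in
    # float32, then convert the output back so downstream layers are unchanged.
    input_dtype = q.dtype
    q, k, v = (t.to(dtype=torch.float) for t in (q, k, v))
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
    return out.to(dtype=input_dtype)


# Example: bfloat16 inputs go in, bfloat16 output comes back out.
q = torch.randn(1, 8, 16, 64, dtype=torch.bfloat16)
k = torch.randn(1, 8, 16, 64, dtype=torch.bfloat16)
v = torch.randn(1, 8, 16, 64, dtype=torch.bfloat16)
print(sdpa_float32_roundtrip(q, k, v).dtype)  # torch.bfloat16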
