Avoid converting k and v to q dtype #2201

Closed · wants to merge 1 commit
9 changes: 9 additions & 0 deletions examples/models/llama2/builder.py
@@ -176,6 +176,15 @@ def to_dtype(self, dtype_override: Optional[DType]) -> "LlamaEdgeManager":
logging.info(f"model.to {torch_dtype}")
self.model = self.model.to(dtype=torch_dtype)
self.dtype = dtype_override

# Convert the KV cache example inputs to the target dtype as well. This can be
# removed once mutable buffers are supported. Assumes the KV cache tensors are
# the last two entries in the example inputs.
if self.use_kv_cache:
dtype = torch.float16 if self.dtype == DType.fp16 else torch.float32
example_inputs = list(self.example_inputs[:-2]) + [
cache.to(dtype) for cache in self.example_inputs[-2:]
]
self.example_inputs = tuple(example_inputs)
return self

def source_transform(
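For reference, a minimal standalone sketch of the recasting done above, using made-up shapes for the token input and the two cache tensors (the real shapes come from the model's cache sizes):

```python
import torch

# Hypothetical example inputs: (tokens, cache_k, cache_v); shapes are illustrative only.
tokens = torch.tensor([[1]], dtype=torch.long)
cache_k = torch.zeros(1, 8, 16, 64)
cache_v = torch.zeros(1, 8, 16, 64)
example_inputs = (tokens, cache_k, cache_v)

# Cast only the last two tensors (the KV caches) to the target dtype;
# integer inputs such as token ids keep their original dtype.
target_dtype = torch.float16
example_inputs = tuple(
    list(example_inputs[:-2]) + [cache.to(target_dtype) for cache in example_inputs[-2:]]
)

assert example_inputs[0].dtype == torch.long
assert example_inputs[-1].dtype == torch.float16
```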
8 changes: 2 additions & 6 deletions examples/models/llama2/model.py
@@ -296,10 +296,6 @@ def forward(
# tensor will be 2-dimensional, regardless of the values of l & s
mask = torch.squeeze(mask, [0, 1])

# FIXME: This should be so automatically! MKG
keys = keys.to(dtype=xq.dtype)
values = values.to(dtype=xq.dtype)

output = F.scaled_dot_product_attention(
xq, keys, values, attn_mask=mask, dropout_p=0.0
)
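Context for the removal above: torch.nn.functional.scaled_dot_product_attention expects query, key, and value to share a dtype, which is why the explicit casts were needed while the caches were always created in float32. A small sketch (with made-up batch/head/sequence dimensions) of the call once all three tensors already carry the model dtype:

```python
import torch
import torch.nn.functional as F

dtype = torch.float32  # stands in for the model dtype (e.g. fp16 on device)
xq = torch.randn(1, 8, 1, 64, dtype=dtype)      # (bsz, n_heads, seqlen, head_dim)
keys = torch.randn(1, 8, 16, 64, dtype=dtype)   # KV cache already in the model dtype
values = torch.randn(1, 8, 16, 64, dtype=dtype)
mask = torch.zeros(1, 16, dtype=dtype)          # additive attention mask

# No keys/values .to(xq.dtype) casts are needed when the cache dtype matches xq.
output = F.scaled_dot_product_attention(xq, keys, values, attn_mask=mask, dropout_p=0.0)
print(output.shape)  # torch.Size([1, 8, 1, 64])
```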
@@ -672,8 +668,8 @@ def get_example_inputs(self):

def get_example_inputs_kvcache(self):
cache_sizes = self.model_.get_cache_sizes()
cache_k = torch.zeros(cache_sizes)
cache_v = torch.zeros(cache_sizes)
cache_k = torch.zeros(cache_sizes, dtype=self.dtype)
cache_v = torch.zeros(cache_sizes, dtype=self.dtype)
return (
torch.tensor(
[[1]], dtype=torch.long
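A minimal sketch of what the change above produces, assuming a hypothetical cache size and an fp16 model dtype: token ids stay integer while both caches are zero-initialized directly in the model dtype instead of defaulting to float32 and being cast later by the attention code.

```python
import torch

# Hypothetical values standing in for self.model_.get_cache_sizes() and self.dtype.
cache_sizes = (1, 8, 16, 64)
model_dtype = torch.float16

cache_k = torch.zeros(cache_sizes, dtype=model_dtype)
cache_v = torch.zeros(cache_sizes, dtype=model_dtype)

tokens = torch.tensor([[1]], dtype=torch.long)  # token ids stay integer
print(cache_k.dtype, cache_v.dtype, tokens.dtype)  # torch.float16 torch.float16 torch.int64
```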