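Update decode_n_tokens to clone cur_token once per step and reuse that clone (out_token) for both the decode_one_token call and the yielded value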

vmpuri · vmpuri · commit 5f6fba2d3ac7 · 2024-07-17T14:10:44.000-07:00
[ghstack-poisoned]
diff --git a/generate.py b/generate.py
@@ -323,9 +323,10 @@ def decode_n_tokens(
             # Actually better for Inductor to codegen attention here
             with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]):
 
+                out_token = cur_token.clone()
                 next_token, next_prob = self.decode_one_token(
                     model,
-                    cur_token.clone(),
+                    out_token,
                     input_pos,
                     need_probs=need_probs,
                     **sampling_kwargs,
@@ -334,10 +335,10 @@ def decode_n_tokens(
                 new_tokens.append(next_token.clone())
                 callback(new_tokens[-1], done_generating=_i == num_new_tokens - 2)
                 if need_probs or next_prob is None:
-                    yield cur_token.clone(), None
+                    yield out_token, None
                 else:
                     new_probs.append(next_prob.clone())
-                    yield cur_token.clone(), next_prob.clone()
+                    yield out_token, next_prob.clone()
                 cur_token = next_token
 
                 # encountered eos