@@ -519,8 +519,10 @@ def _main(

     tokenizer = _initialize_tokenizer(tokenizer_args)

-    # Right now the assumption is only llama3 uses tiktokenizer and it must use tiktokenizer.
-    # Piggy backing off of this flag then for now to identify llama3 without prompting user.
+    # Right now the assumption is only llama3 uses tiktokenizer and it
+    # must use tiktokenizer.
+    # Piggy backing off of this flag then for now to identify llama3
+    # without prompting user.
     is_llama3_model = tokenizer_args.is_tiktoken
     if generator_args.chat_mode and is_llama3_model:
         logging.debug(
@@ -610,7 +612,8 @@ def _main(
     start = -1 if generator_args.compile else 0
     start_pos = 0

-    # arbitrarily large number as chat mode goes until max_seq length or user exits
+    # arbitrarily large number as chat mode goes until max_seq length
+    # or user exits
     num_samples = generator_args.num_samples if not generator_args.chat_mode else 100000
     i = (
         -1
@@ -743,28 +746,33 @@ def callback(x):
         tokens_generated = y.size(0) - prompt_length
         tokens_sec = tokens_generated / t
         aggregate_metrics["tokens_per_sec"].append(tokens_sec)
-        logging.debug(
+        print(
             f"Time for inference {i + 1}: {t:.02f} sec total, {tokens_sec:.02f} tokens/sec"
         )
-        logging.debug(f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s")
+        print(f"Bandwidth achieved: {model_size * tokens_sec / 1e9:.02f} GB/s")

         if start_pos >= max_seq_length:
-            print("Max Sequence Length Reached. Ending Conversation.")
-            break
+            print(f"[Max Sequence Length Reached. Ending Conversation.]")
+            print(f"---------------------------------------------------")
+            if generator_args.chat_mode:
+                break

-        print("==========")
+        if not generator_args.chat_mode:
+            start_pos = 0
+
+        print("\n========================================\n")
     if is_speculative:
         counts_aggregated = [sum(i) for i in zip(*aggregate_metrics["accept_counts"])]
         acceptance_probs = [i / sum(counts_aggregated) for i in counts_aggregated]
-        logging.info(f"Acceptance probs: {acceptance_probs}")
-        logging.info(
+        print(f"Acceptance probs: {acceptance_probs}")
+        print(
             f"Mean Accepted: {sum([idx * i for idx, i in enumerate(counts_aggregated)])/sum(counts_aggregated)}"
         )

-        logging.info(
+        print(
             f"Average tokens/sec: {torch.mean(torch.tensor(aggregate_metrics['tokens_per_sec'])).item():.2f}"
         )
-        logging.info(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")
+        print(f"Memory used: {torch.cuda.max_memory_reserved() / 1e9:.02f} GB")


 def main(args):