@@ -178,53 +178,75 @@ class Llama {
  /// Returns a tuple with the generated text and a boolean indicating if the end-of-sequence token is reached.
  /// An exception is thrown if llama_decode fails during processing.
  (String, bool) getNext() {
-    Pointer<Int32> newTokenId = calloc.allocate<Int32>(sizeOf<Int32>());
+    // Allocate memory for the new token ID.
+    Pointer<Int32> newTokenId = calloc<Int32>();
+
+    // Get the number of vocabulary items.
    final int nVocab = lib.llama_n_vocab(model);
+
+    // Get the logits for the last token generated.
    final logits = lib.llama_get_logits_ith(context, batch.n_tokens - 1);

+    // Prepare candidates array to hold token data for all vocabulary items.
    final Pointer<llama_token_data> candidates = calloc<llama_token_data>(nVocab);
    for (int tokenId = 0; tokenId < nVocab; tokenId++) {
      candidates[tokenId].id = tokenId;
      candidates[tokenId].logit = logits[tokenId];
-      candidates[tokenId].p = 0.0;
+      candidates[tokenId].p = 0.0; // Initialize probabilities to 0.0.
    }

+    // Create a structure to hold the candidates array.
    final Pointer<llama_token_data_array> candidatesP = calloc<llama_token_data_array>();
    candidatesP.ref.data = candidates;
    candidatesP.ref.size = nVocab;
    candidatesP.ref.sorted = false;

-    SamplingContext sampling = SamplingContext(this);
-    sampling.params = samplingParams;
-
-    newTokenId.value = candidatesP.ref.data.elementAt(0).ref.id;
-    newTokenId.value = sampling.sample(newTokenId, null);
-    sampling.accept(newTokenId.value);
-
-    // newTokenId.value = lib.llama_sample_token_greedy(context, candidatesP);
-    // lastTokens.add(newTokenId);
+    // Apply sampling strategies (e.g., top-k, top-p, temperature) based on SamplingParams.
+    if (samplingParams != null) {
+      final last_tokens = calloc<Int32>(samplingParams!.nPrev);
+      lib.llama_sample_repetition_penalties(
+          context,
+          candidatesP,
+          last_tokens,
+          samplingParams!.penaltyLastN,
+          samplingParams!.penaltyRepeat,
+          samplingParams!.penaltyFreq,
+          samplingParams!.penaltyPresent
+      );
+      lib.llama_sample_top_k(context, candidatesP, samplingParams!.topK, 1);
+      lib.llama_sample_top_p(context, candidatesP, samplingParams!.topP, 1);
+      lib.llama_sample_temperature(context, candidatesP, samplingParams!.temp);
+      calloc.free(last_tokens); // Release the temporary buffer of previous tokens to avoid a leak.
+    }

-    // calloc.free(nativeLastTokens);
-    calloc.free(candidates);
-    calloc.free(candidatesP);
+    // Sample a token from the adjusted logits/probabilities.
+    newTokenId.value = lib.llama_sample_token(context, candidatesP);

-    sampling.dispose();
+    // Check if the sampled token is an EOS token.
+    bool isEOSToken = newTokenId.value == lib.llama_token_eos(model);

+    // Convert the token ID to its string representation.
    final newTokenStr = tokenToPiece(newTokenId.value);

+    // Update the batch and context for the next token generation.
    batch.n_tokens = 0;
    batchAdd(batch, newTokenId.value, cursor, [0], true);

+    // Increment the counters.
    decode++;
    cursor++;

+    // Decode the batch containing the new token.
    if (lib.llama_decode(context, batch) != 0) {
-      throw Exception("failed to evaluate llama!");
+      throw Exception("Failed to evaluate Llama!");
    }

-    int token = newTokenId.value;
+    // Free allocated memory.
    calloc.free(newTokenId);
-    return (newTokenStr, token == lib.llama_token_eos(model));
+    calloc.free(candidates);
+    calloc.free(candidatesP);
+
+    // Return the generated text and whether the EOS token was reached.
+    return (newTokenStr, isEOSToken);
  }

  /// Asynchronously generates text based on a given prompt.
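For readers skimming the diff, here is a minimal usage sketch of the updated getNext() in a generation loop. It is an illustration only: it assumes a Llama instance named llama whose prompt has already been tokenized and decoded into the batch (that setup is not part of this hunk), and it simply accumulates pieces until the end-of-sequence flag comes back true.

    // Hypothetical driver loop; "llama" is an assumed, already-initialized instance.
    final output = StringBuffer();
    while (true) {
      final (piece, isEos) = llama.getNext(); // (String, bool) record, as in the diff above
      if (isEos) break;                       // stop once the EOS token is sampled
      output.write(piece);
    }
    print(output.toString());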