
Commit c51f7d8

pre-7
Sampler and feature status:
- seed: ok
- top-k: ok
- top-p: ok
- min-p: ok
- temperature: ok
- grammar: ko
- repeat/penalty: ok
- logits: ok
- stats: ko
- rope: ok
- speculative decoding: ?
- cache: ko
- lora: ok
1 parent 9488bf1 · commit c51f7d8


3 files changed: +11 −46 lines


example/chat.dart

Lines changed: 4 additions & 4 deletions

@@ -26,9 +26,10 @@ void main() async {
   contextParams.context = 512 * 4;

   Llama llama = Llama(
-      "/Users/adel/Workspace/llama.cpp/models/pivot-10.7b-mistral-v0.2-rp.Q5_K_S.gguf",
+      "/Users/adel/Workspace/llama.cpp/models/openhermes-2.5-neural-chat-v3-3-slerp.Q5_K_M.gguf",
       modelParams,
-      contextParams);
+      contextParams,
+      samplingParams);

   ChatMLFormat chatMLFormat = ChatMLFormat();
   // AlpacaFormat alpacaFormat = AlpacaFormat();

@@ -43,7 +44,7 @@ Context: Teplizumab traces its roots to a New Jersey drug company called Ortho P

   llama.setPrompt(system + prompt);
   while (true) {
-    var (token, done) = llama.getNext(samplingParams);
+    var (token, done) = llama.getNext();
     String? chunk = chatMLFormat.filterResponse(token);
     if (chunk != null) stdout.write(token);
     if (done) break;

@@ -53,7 +54,6 @@ Context: Teplizumab traces its roots to a New Jersey drug company called Ortho P
   llama.clear();
   stdout.write("\n");

-  //*
   prompt = chatMLFormat.preparePrompt("What was the company called?");
   llama.setPrompt(system + prompt);
   while (true) {
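Net effect of the chat.dart hunks: sampling configuration now travels through the Llama constructor once, and getNext() takes no arguments. A minimal sketch of the resulting call pattern, assuming the constructor order shown above; the model path and prompt are placeholders, not from the commit:

  import 'dart:io';
  import 'package:llama_cpp_dart/llama_cpp_dart.dart';

  void main() {
    ContextParams contextParams = ContextParams();
    contextParams.context = 512 * 4;

    // Sampling is configured once, at construction.
    Llama llama =
        Llama("model.gguf", ModelParams(), contextParams, SamplingParams());
    ChatMLFormat chatMLFormat = ChatMLFormat();

    llama.setPrompt(chatMLFormat.preparePrompt("Hello"));
    while (true) {
      var (token, done) = llama.getNext(); // no per-call SamplingParams anymore
      String? chunk = chatMLFormat.filterResponse(token);
      if (chunk != null) stdout.write(chunk);
      if (done) break;
    }
    llama.clear(); // reset KV cache and counters between turns
  }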

lib/src/llama.dart

Lines changed: 6 additions & 41 deletions

@@ -66,19 +66,19 @@ class Llama {
   String loraBase;
   List<(String, double)> loraAdapters;

-  // late SamplingContext sampling;
-
   /// Constructor for Llama.
   ///
   /// Loads the model and context based on provided model and context parameters.
   Llama(String modelPath,
       [ModelParams? modelParams,
       ContextParams? contextParams,
-      this.samplingParams,
+      SamplingParams? samplingParams,
+      // this.samplingParams,
       this.loraBase = "",
       this.loraAdapters = const []])
       : modelParams = modelParams ?? ModelParams(),
-        contextParams = contextParams ?? ContextParams() {
+        contextParams = contextParams ?? ContextParams(),
+        samplingParams = samplingParams ?? SamplingParams() {
     lib.llama_backend_init(false);
     llama_model_params modelParams = this.modelParams.get();
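Together with the getNext() hunk below, this moves the SamplingParams fallback from a lazy "??=" inside the generation loop into the constructor's initializer list, so the field is settled before the first token is sampled. The pattern in isolation, with illustrative names not taken from the commit:

  // Initializer-list default: resolved once at construction, so later
  // code never needs a null check or a lazy fallback.
  class Sampler {
    final int topK;
    Sampler([int? topK]) : topK = topK ?? 40;
  }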

@@ -118,8 +118,6 @@ class Llama {
       }
     }
     malloc.free(cLoraBase);
-
-    // sampling = SamplingContext(this);
   }

   /// Releases all resources associated with the Llama instance.
@@ -147,8 +145,8 @@
   /// An exception is thrown if the required KV cache size exceeds the context's limit.
   /// The function also initializes the batch for token processing.
   setPrompt(String prompt) {
+    // context = lib.llama_new_context_with_model(model, contextParams.get());
     tokensList = tokenize(prompt, true);
-    // temporaryInvalidCChars = [];

     if (length != -1) {
       int nCtx = lib.llama_n_ctx(context);
@@ -180,8 +178,6 @@
   /// Returns a tuple with the generated text and a boolean indicating if the end-of-sequence token is reached.
   /// An exception is thrown if llama_decode fails during processing.
   (String, bool) getNext() {
-    samplingParams ??= SamplingParams();
-
     Pointer<Int32> newTokenId = calloc.allocate<Int32>(sizeOf<Int32>());
     final nVocab = lib.llama_n_vocab(model);
     final logits = lib.llama_get_logits_ith(context, batch.n_tokens - 1);
@@ -202,33 +198,12 @@
       ..size = nVocab
       ..sorted = true;

-    /*
-    final Pointer<llama_token> nativeLastTokens =
-        malloc.allocate<llama_token>(sizeOf<llama_token>() * lastTokens.length);
-    for (int i = 0; i < lastTokens.length; i++) {
-      nativeLastTokens.elementAt(i).value = i;
-    }
-
-    Pointer<llama_sampling_params> sp = samplingParams!.get();
-    lib.llama_sample_repetition_penalties(
-        context,
-        candidatesP,
-        nativeLastTokens,
-        sp.ref.penalty_last_n,
-        sp.ref.penalty_repeat,
-        sp.ref.penalty_freq,
-        sp.ref.penalty_present);
-    */
-
     SamplingContext sampling = SamplingContext(this);
     sampling.params = samplingParams;

-    // int minKeep = max(1, samplingParams.nProbs);
-    // sampling.tfsZ(candidatesP, minKeep, nVocab);
     newTokenId.value = candidatesP.ref.data.elementAt(0).ref.id;
     newTokenId.value = sampling.sample(newTokenId, null);
-
-    // newTokenId.value = candidatesP.ref.data.elementAt(0).ref.id;
+    sampling.accept(newTokenId.value);

     // newTokenId.value = lib.llama_sample_token_greedy(context, candidatesP);
     // lastTokens.add(newTokenId);
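The surviving path mirrors llama.cpp's common-sampling split: sample() runs the configured chain (seed, top-k, top-p, min-p, temperature, repeat penalties) over the candidate logits, and the newly added accept() pushes the chosen token into the context's history so penalty samplers see it on later calls. Distilled from the hunk above with comments added and candidate setup elided; the note on dispose() is an assumption about what it frees:

  SamplingContext sampling = SamplingContext(this);
  sampling.params = samplingParams;

  newTokenId.value = candidatesP.ref.data.elementAt(0).ref.id; // top-ranked fallback
  newTokenId.value = sampling.sample(newTokenId, null);        // run the sampler chain
  sampling.accept(newTokenId.value);                           // record for penalty history

  sampling.dispose(); // assumed to release the context's native allocations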
@@ -239,13 +214,6 @@

     sampling.dispose();

-    if (newTokenId.value == lib.llama_token_eos(model)) {
-      int token = newTokenId.value;
-      calloc.free(newTokenId);
-      final newTokenStr = tokenToPiece(newTokenId.value);
-      return (newTokenStr, token == lib.llama_token_eos(model));
-    }
-
     final newTokenStr = tokenToPiece(newTokenId.value);

     batch.n_tokens = 0;
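Dropping this early-return also removes a latent use-after-free: the deleted block called calloc.free(newTokenId) and then read newTokenId.value again inside tokenToPiece. Had the fast path stayed, a safe variant would copy the id out before freeing; a sketch, not code from the commit:

  if (newTokenId.value == lib.llama_token_eos(model)) {
    final int token = newTokenId.value; // copy before freeing
    calloc.free(newTokenId);
    return (tokenToPiece(token), true);
  }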
@@ -288,11 +256,8 @@
     lastTokens.clear();
     lib.llama_kv_cache_clear(context);
     batch.n_tokens = 0;
-    tokensList.clear();
-    lastTokens.clear();
     cursor = 0;
     decode = 0;
-    // lib.llama
   }

   // Utility methods

lib/src/sampling_context.dart

Lines changed: 1 addition & 1 deletion

@@ -6,7 +6,6 @@ import 'package:ffi/ffi.dart';
 import 'package:llama_cpp_dart/llama_cpp_dart.dart';

 import 'llama_cpp.dart';
-import 'sampling_params.dart';

 class SamplingContext {
   final List<Pointer<llama_token>> _prev = [];
@@ -257,6 +256,7 @@

   accept(int id) {
     if (_prev.isNotEmpty) {
+      calloc.free(_prev[0]);
       _prev.removeAt(0);
     }
     Pointer<llama_token> idx =
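The added line closes a native memory leak: each accept() heap-allocates a llama_token for the history ring, and before this change the evicted head was only removed from the Dart list, leaving its native allocation unreachable (Dart's GC never reclaims calloc'd memory). A hedged reconstruction of the whole method under that reading; everything after the visible fragment is an assumption:

  accept(int id) {
    if (_prev.isNotEmpty) {
      calloc.free(_prev[0]); // the fix: free the evicted allocation first
      _prev.removeAt(0);
    }
    Pointer<llama_token> idx =
        calloc.allocate<llama_token>(sizeOf<llama_token>()); // assumed tail-append
    idx.value = id;
    _prev.add(idx);
  }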
