Skip to content

Commit 6a54973

Browse files
committed
Merge branch 'master' into compilade/convert-hf-refactor
2 parents 13f4cf7 + 60325fa commit 6a54973

File tree

11 files changed

+495
-152
lines changed

11 files changed

+495
-152
lines changed

.github/workflows/close-issue.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ jobs:
1212
steps:
1313
- uses: actions/stale@v5
1414
with:
15-
exempt-issue-labels: "refactor,help wanted,good first issue,research"
15+
exempt-issue-labels: "refactor,help wanted,good first issue,research,bug"
1616
days-before-issue-stale: 30
1717
days-before-issue-close: 14
1818
stale-issue-label: "stale"

common/common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ struct gpt_params {
135135
bool multiple_choice = false; // compute TruthfulQA score over random tasks from datafile supplied in prompt
136136
size_t multiple_choice_tasks = 0; // number of tasks to use when computing the TruthfulQA score. If 0, all tasks will be computed
137137

138-
bool kl_divergence = false; // compute KL-divergence
138+
bool kl_divergence = false; // compute KL divergence
139139

140140
bool random_prompt = false; // do not randomize prompt if none provided
141141
bool use_color = false; // use color to distinguish generations and inputs

common/log.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
234234
// INTERNAL, DO NOT USE
235235
// USE LOG() INSTEAD
236236
//
237-
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
237+
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
238238
#define LOG_IMPL(str, ...) \
239239
do { \
240240
if (LOG_TARGET != nullptr) \
@@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
257257
// INTERNAL, DO NOT USE
258258
// USE LOG_TEE() INSTEAD
259259
//
260-
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
260+
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
261261
#define LOG_TEE_IMPL(str, ...) \
262262
do { \
263263
if (LOG_TARGET != nullptr) \

examples/main/main.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -544,7 +544,7 @@ int main(int argc, char ** argv) {
544544
// if we run out of context:
545545
// - take the n_keep first tokens from the original prompt (via n_past)
546546
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
547-
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
547+
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
548548
if (params.n_predict == -2) {
549549
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
550550
break;

examples/perplexity/README.md

Lines changed: 115 additions & 3 deletions
Large diffs are not rendered by default.

examples/perplexity/perplexity.cpp

Lines changed: 175 additions & 55 deletions
Large diffs are not rendered by default.

examples/server/tests/features/results.feature

Lines changed: 56 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -7,44 +7,16 @@ Feature: Results
77
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
88
And a model file test-model-00001-of-00003.gguf
99
And 128 as batch size
10-
And 256 KV cache size
10+
And 1024 KV cache size
1111
And 128 max tokens to predict
12+
And continuous batching
1213

13-
Scenario Outline: Multi users completion
14+
Scenario Outline: consistent results with same seed
1415
Given <n_slots> slots
15-
And continuous batching
1616
Then the server is starting
1717
Then the server is healthy
1818

19-
Given 42 as seed
20-
And a prompt:
21-
"""
22-
Write a very long story about AI.
23-
"""
24-
25-
Given 42 as seed
26-
And a prompt:
27-
"""
28-
Write a very long story about AI.
29-
"""
30-
31-
Given 42 as seed
32-
And a prompt:
33-
"""
34-
Write a very long story about AI.
35-
"""
36-
37-
Given 42 as seed
38-
And a prompt:
39-
"""
40-
Write a very long story about AI.
41-
"""
42-
43-
Given 42 as seed
44-
And a prompt:
45-
"""
46-
Write a very long story about AI.
47-
"""
19+
Given 4 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
4820

4921
Given concurrent completion requests
5022
Then the server is busy
@@ -55,3 +27,55 @@ Feature: Results
5527
| n_slots |
5628
| 1 |
5729
| 2 |
30+
31+
Scenario Outline: different results with different seed
32+
Given <n_slots> slots
33+
Then the server is starting
34+
Then the server is healthy
35+
36+
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
37+
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 43
38+
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 44
39+
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 45
40+
41+
Given concurrent completion requests
42+
Then the server is busy
43+
Then the server is idle
44+
And all slots are idle
45+
Then all predictions are different
46+
Examples:
47+
| n_slots |
48+
| 1 |
49+
| 2 |
50+
51+
Scenario Outline: consistent results with same seed and varying batch size
52+
Given 4 slots
53+
And <temp> temperature
54+
# And 0 as draft
55+
Then the server is starting
56+
Then the server is healthy
57+
58+
Given 1 prompts "Write a very long story about AI." with seed 42
59+
And concurrent completion requests
60+
# Then the server is busy # Not all slots will be utilized.
61+
Then the server is idle
62+
And all slots are idle
63+
64+
Given <n_parallel> prompts "Write a very long story about AI." with seed 42
65+
And concurrent completion requests
66+
# Then the server is busy # Not all slots will be utilized.
67+
Then the server is idle
68+
And all slots are idle
69+
70+
Then all predictions are equal
71+
Examples:
72+
| n_parallel | temp |
73+
| 1 | 0.0 |
74+
| 2 | 0.0 |
75+
| 4 | 0.0 |
76+
| 1 | 1.0 |
77+
# FIXME: These tests fail on master. The problem seems to be the unified KV cache.
78+
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
79+
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
80+
# | 2 | 1.0 |
81+
# | 4 | 1.0 |

0 commit comments

Comments
 (0)