Skip to content

Commit 373f916

Browse files
redlion0929 and ggerganov
authored and committed
server : normalize embeddings (ggml-org#5956)
* output normalize embedding in '/v1/embeddings' * common : reuse llama_embd_normalize * common : better normalize impl --------- Co-authored-by: Georgi Gerganov <[email protected]>
1 parent f0fa237 commit 373f916

File tree

4 files changed

+30
-14
lines changed

4 files changed

+30
-14
lines changed

common/common.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1852,3 +1852,18 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
18521852

18531853
printf("\n=== Done dumping\n");
18541854
}
1855+
1856+
// Normalize the n-element embedding vector `inp` to unit (L2) length,
// writing the result to `out` (in-place use, inp == out, is safe).
//
//   inp : input vector of n floats
//   out : output buffer of n floats
//   n   : number of elements
//
// An all-zero input yields an all-zero output (no division by zero).
void llama_embd_normalize(const float * inp, float * out, int n) {
    // accumulate the squared magnitude in double precision: promoting each
    // element BEFORE squaring avoids float round-off and overflow of x*x
    // for |x| > ~1.8e19, which a float-precision square would hit
    double sum = 0.0;
    for (int i = 0; i < n; i++) {
        const double v = inp[i];
        sum += v * v;
    }
    const double len = sqrt(sum);

    // guard against an all-zero vector (len == 0) to avoid producing NaNs
    const float norm = len > 0.0 ? (float)(1.0 / len) : 0.0f;

    for (int i = 0; i < n; i++) {
        out[i] = inp[i] * norm;
    }
}

common/common.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -260,3 +260,10 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);
260260

261261
// Dump the KV cache view showing individual sequences in each cell (long output).
262262
void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
263+
264+
//
265+
// Embedding utils
266+
//
267+
268+
void llama_embd_normalize(const float * inp, float * out, int n);
269+

examples/embedding/embedding.cpp

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,17 +23,6 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
2323
}
2424
}
2525

26-
// Normalize `vec` (n floats) to unit L2 length into `out`.
//
//   vec : input vector of n floats
//   out : output buffer of n floats
//   n   : number of elements
//
// The squared magnitude is accumulated in double precision (each element
// is promoted before squaring) to avoid float round-off and overflow,
// and an all-zero input yields an all-zero output instead of NaNs from
// dividing by a zero norm.
static void normalize(const float * vec, float * out, int n) {
    double sum = 0.0;
    for (int i = 0; i < n; i++) {
        const double v = vec[i];
        sum += v * v;
    }
    const double len = sqrt(sum);

    // guard: dividing by a zero norm would fill `out` with NaNs
    const float scale = len > 0.0 ? (float)(1.0 / len) : 0.0f;

    for (int i = 0; i < n; i++) {
        out[i] = vec[i] * scale;
    }
}
36-
3726
static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
3827
// clear previous kv_cache values (irrelevant for embeddings)
3928
llama_kv_cache_clear(ctx);
@@ -44,7 +33,6 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
4433
fprintf(stderr, "%s : failed to decode\n", __func__);
4534
}
4635

47-
// normalize on copy
4836
for (int i = 0; i < batch.n_tokens; i++) {
4937
if (!batch.logits[i]) {
5038
continue;
@@ -61,7 +49,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
6149
}
6250

6351
float * out = output + batch.seq_id[i][0] * n_embd;
64-
normalize(embd, out, n_embd);
52+
llama_embd_normalize(embd, out, n_embd);
6553
}
6654
}
6755

examples/server/server.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1327,6 +1327,8 @@ struct server_context {
13271327

13281328
const int n_embd = llama_n_embd(model);
13291329

1330+
std::vector<float> embd_res(n_embd, 0.0f);
1331+
13301332
for (int i = 0; i < batch.n_tokens; ++i) {
13311333
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id + 1) {
13321334
continue;
@@ -1350,8 +1352,10 @@ struct server_context {
13501352
continue;
13511353
}
13521354

1355+
llama_embd_normalize(embd, embd_res.data(), n_embd);
1356+
13531357
res.data = json {
1354-
{"embedding", std::vector<float>(embd, embd + n_embd)},
1358+
{"embedding", embd_res},
13551359
};
13561360
}
13571361

@@ -3354,6 +3358,8 @@ int main(int argc, char ** argv) {
33543358
// get the result
33553359
server_task_result result = ctx_server.queue_results.recv(id_task);
33563360
ctx_server.queue_results.remove_waiting_task_id(id_task);
3361+
3362+
// append to the responses
33573363
responses.push_back(result.data);
33583364
}
33593365

0 commit comments

Comments
 (0)