Refactor encoding functions to make normalization optional

monatis · monatis · commit f6f590b7aab5 · 2023-09-13T00:09:41.000+03:00
diff --git a/clip.cpp b/clip.cpp
@@ -13,7 +13,7 @@
 #define STB_IMAGE_IMPLEMENTATION
 #include "stb_image.h"
 
-// #define CLIP_DEBUG
+#define CLIP_DEBUG
 
 // utility function for a workaround until https://github.com/ggerganov/ggml/issues/260 is resolved
 // after that, remove this and use the mechanism implemented in GGML directly
@@ -830,12 +830,13 @@ void clip_free(clip_ctx * ctx) {
     delete ctx;
 }
 
-bool clip_text_encode_c(const clip_ctx * ctx, int n_threads, const clip_tokens * tokens, float * vec) {
+bool clip_text_encode_c(const clip_ctx * ctx, int n_threads, const clip_tokens * tokens, float * vec, const bool normalize) {
     std::vector<int> _tokens(tokens->data, tokens->data + tokens->size);
-    return clip_text_encode(ctx, n_threads, _tokens, vec);
+    return clip_text_encode(ctx, n_threads, _tokens, vec, normalize);
 }
 
-bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_vocab::id> & tokens, float * vec) {
+bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_vocab::id> & tokens, float * vec,
+                      const bool normalize) {
     const auto & model = ctx->text_model;
     const auto & hparams = model.hparams;
     const int N = tokens.size();
@@ -975,8 +976,10 @@ bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector<cli
     embeddings = ggml_mul_mat(ctx0, model.projection, embeddings);
 
     // normalize output embeddings
-    ggml_tensor * length = ggml_sqrt(ctx0, ggml_sum(ctx0, ggml_sqr(ctx0, embeddings)));
-    embeddings = ggml_scale_inplace(ctx0, embeddings, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length));
+    if (normalize) {
+        ggml_tensor * length = ggml_sqrt(ctx0, ggml_sum(ctx0, ggml_sqr(ctx0, embeddings)));
+        embeddings = ggml_scale_inplace(ctx0, embeddings, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length));
+    }
 
     ggml_set_name(embeddings, "check");
 
@@ -1045,17 +1048,18 @@ bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector<cli
     return true;
 }
 
-bool clip_image_encode_c(const clip_ctx * ctx, int n_threads, const clip_image_f32 * img, float * vec) {
-    return clip_image_encode(ctx, n_threads, *img, vec);
+bool clip_image_encode_c(const clip_ctx * ctx, int n_threads, const clip_image_f32 * img, float * vec, const bool normalize) {
+    return clip_image_encode(ctx, n_threads, *img, vec, normalize);
 }
 
-bool clip_image_encode(const clip_ctx * ctx, int n_threads, const clip_image_f32 & img, float * vec) {
+bool clip_image_encode(const clip_ctx * ctx, int n_threads, const clip_image_f32 & img, float * vec, const bool normalize) {
     std::vector<clip_image_f32> imgs;
     imgs.push_back(img);
-    return clip_image_batch_encode(ctx, n_threads, imgs, vec);
+    return clip_image_batch_encode(ctx, n_threads, imgs, vec, normalize);
 }
 
-bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_image_f32> & imgs, float * vec) {
+bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_image_f32> & imgs, float * vec,
+                             const bool normalize) {
     const auto & model = ctx->vision_model;
     const auto & hparams = model.hparams;
 
@@ -1251,8 +1255,10 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec
 
     for (int b = 0; b < batch_size; b++) {
         struct ggml_tensor * embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, b));
-        ggml_tensor * length = ggml_sqrt(ctx0, ggml_sum(ctx0, ggml_sqr(ctx0, embedding)));
-        embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length));
+        if (normalize) {
+            ggml_tensor * length = ggml_sqrt(ctx0, ggml_sum(ctx0, ggml_sqr(ctx0, embedding)));
+            embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length));
+        }
         output = ggml_acc(ctx0, output, embedding, output->nb[1], output->nb[2], output->nb[3], b * ggml_nbytes(embedding));
     }
     ggml_set_name(output, "check");
@@ -1330,10 +1336,7 @@ float clip_similarity_score(float * vec1, float * vec2, int vec_dim) {
         dot_product += vec1[i] * vec2[i];
     }
 
-    // Clamp the dot product to the range [0, 1].
-    float clamped_dot_product = fmin(fmax(dot_product, 0.0), 1.0);
-
-    return clamped_dot_product;
+    return dot_product;
 }
 
 bool clip_compare_text_and_image_c(clip_ctx * ctx, int n_threads, char * text, clip_image_u8 * image, float * score) {
@@ -1354,14 +1357,14 @@ bool clip_compare_text_and_image(clip_ctx * ctx, int n_threads, std::string & te
         return false;
     }
 
-    if (!clip_image_encode(ctx, n_threads, img_res, img_vec)) {
+    if (!clip_image_encode(ctx, n_threads, img_res, img_vec, true)) {
         return false;
     }
 
     // tokenize and encode text
     auto tokens = clip_tokenize(ctx, text);
 
-    if (!clip_text_encode(ctx, n_threads, tokens, txt_vec)) {
+    if (!clip_text_encode(ctx, n_threads, tokens, txt_vec, true)) {
         return false;
     }
 
@@ -1396,18 +1399,18 @@ bool softmax_with_sorting(float * arr, int length, float * sorted_scores, int *
     }
 
     // Calculate softmax probabilities
-    /*
-    float max_val = arr[0];
-    for (int i = 1; i < length; i++) {
-        if (arr[i] > max_val) {
-            max_val = arr[i];
-        }
-    }
-*/
+    // Shift every score by the largest one before exponentiating. Softmax is invariant to this
+    // shift, and it keeps exp() from overflowing when callers pass unnormalized similarities.
+    float max_val = length > 0 ? arr[0] : 0.0f;
+    for (int i = 1; i < length; i++) {
+        if (arr[i] > max_val) {
+            max_val = arr[i];
+        }
+    }
 
-    float sum = 0.0;
+    double sum = 0.0;
     for (int i = 0; i < length; i++) {
-        arr[i] = exp(arr[i]);
+        arr[i] = exp(arr[i] - max_val) + 1e-9;
         sum += arr[i];
     }
 
diff --git a/clip.h b/clip.h
@@ -224,8 +224,10 @@ struct clip_image_f32 * make_clip_image_f32();
 bool clip_image_load_from_file_c(const char * fname, struct clip_image_u8 * img);
 bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res);
 
-bool clip_text_encode_c(const struct clip_ctx * ctx, int n_threads, const struct clip_tokens * tokens, float * vec);
-bool clip_image_encode_c(const struct clip_ctx * ctx, int n_threads, const struct clip_image_f32 * img, float * vec);
+bool clip_text_encode_c(const struct clip_ctx * ctx, int n_threads, const struct clip_tokens * tokens, float * vec,
+                        const bool normalize);
+bool clip_image_encode_c(const struct clip_ctx * ctx, int n_threads, const struct clip_image_f32 * img, float * vec,
+                         const bool normalize);
 
 // bool image_normalize(clip_image_u8 *img, clip_image_f32 *res);
 
@@ -241,13 +243,16 @@ std::vector<clip_vocab::id> clip_tokenize(const clip_ctx * ctx, const std::strin
 
 bool clip_image_load_from_file(const std::string & fname, clip_image_u8 & img);
 
-bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_vocab::id> & tokens, float * vec);
-bool clip_image_encode(const struct clip_ctx * ctx, int n_threads, const struct clip_image_f32 & img, float * vec);
+bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_vocab::id> & tokens, float * vec,
+                      const bool normalize);
+bool clip_image_encode(const struct clip_ctx * ctx, int n_threads, const struct clip_image_f32 & img, float * vec,
+                       const bool normalize);
 
 bool clip_compare_text_and_image(clip_ctx * ctx, int n_threads, std::string & text, clip_image_u8 & image, float * score);
 
 // TODO clip_image_batch_encode_c
-bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_image_f32> & imgs, float * vec);
+bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_image_f32> & imgs, float * vec,
+                             const bool normalize);
 
 // TODO clip_image_batch_preprocess_c
 void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, const std::vector<clip_image_u8> & img_inputs,
diff --git a/examples/image-search/build.cpp b/examples/image-search/build.cpp
@@ -126,7 +126,7 @@ int main(int argc, char ** argv) {
 
                 clip_image_batch_preprocess(clip_ctx, params.n_threads, img_inputs, imgs_resized);
 
-                clip_image_batch_encode(clip_ctx, params.n_threads, imgs_resized, vec.data());
+                clip_image_batch_encode(clip_ctx, params.n_threads, imgs_resized, vec.data(), true);
 
                 // add image vectors to the database
                 for (size_t b = 0; b < batch_size; b++) {
@@ -162,7 +162,7 @@ int main(int argc, char ** argv) {
                 }
 
                 clip_image_batch_preprocess(clip_ctx, params.n_threads, img_inputs, imgs_resized);
-                clip_image_batch_encode(clip_ctx, params.n_threads, imgs_resized, vec.data());
+                clip_image_batch_encode(clip_ctx, params.n_threads, imgs_resized, vec.data(), true);
 
                 // add image vectors to the database
                 for (size_t l = 0; l < leftover; l++) {
diff --git a/examples/image-search/search.cpp b/examples/image-search/search.cpp
@@ -142,7 +142,7 @@ int main(int argc, char ** argv) {
         clip_image_f32 img_res;
         clip_image_preprocess(clip_ctx, &img0, &img_res);
 
-        if (!clip_image_encode(clip_ctx, params.n_threads, img_res, vec.data())) {
+        if (!clip_image_encode(clip_ctx, params.n_threads, img_res, vec.data(), true)) {
             fprintf(stderr, "%s: failed to encode image from '%s'\n", __func__, params.img_path.c_str());
             clip_free(clip_ctx);
             return 1;
@@ -151,7 +151,7 @@ int main(int argc, char ** argv) {
 
         auto tokens = clip_tokenize(clip_ctx, params.search_text);
 
-        clip_text_encode(clip_ctx, params.n_threads, tokens, vec.data());
+        clip_text_encode(clip_ctx, params.n_threads, tokens, vec.data(), true);
     }
 
     auto results = embd_index.search({vec.data(), vec.size()}, params.n_results);
diff --git a/examples/main.c b/examples/main.c
@@ -1,15 +1,16 @@
 #include "clip.h"
-#include "stdio.h"
+#include <stdbool.h>
+#include <stdio.h>
 
 int main() {
-    char *model_path = "../../models/openai_clip-vit-base-patch32.ggmlv0.q4_1.bin";
-    char *img_path = "../../tests/red_apple.jpg";
-    char *text = "an apple";
+    char * model_path = "../../models/openai_clip-vit-base-patch32.ggmlv0.q4_1.bin";
+    char * img_path = "../../tests/red_apple.jpg";
+    char * text = "an apple";
     int n_threads = 4;
     int verbosity = 1;
 
     // Load CLIP model
-    struct clip_ctx *ctx = clip_model_load(model_path, verbosity);
+    struct clip_ctx * ctx = clip_model_load(model_path, verbosity);
     if (!ctx) {
         printf("%s: Unable  to load model from %s", __func__, model_path);
         return 1;
@@ -19,22 +20,22 @@ int main() {
     int vec_dim = 512;
 
     // Load image from disk
-    struct clip_image_u8 *img0 = make_clip_image_u8();
+    struct clip_image_u8 * img0 = make_clip_image_u8();
     if (!clip_image_load_from_file_c(img_path, img0)) {
         fprintf(stderr, "%s: failed to load image from '%s'\n", __func__, img_path);
         return 1;
     }
 
     // Preprocess image
-    struct clip_image_f32 *img_res = make_clip_image_f32();
+    struct clip_image_f32 * img_res = make_clip_image_f32();
     if (!clip_image_preprocess(ctx, img0, img_res)) {
         fprintf(stderr, "%s: failed to preprocess image\n", __func__);
         return 1;
     }
 
     // Encode image
     float img_vec[vec_dim];
-    if (!clip_image_encode_c(ctx, n_threads, img_res, img_vec)) {
+    if (!clip_image_encode_c(ctx, n_threads, img_res, img_vec, true)) {
         fprintf(stderr, "%s: failed to encode image\n", __func__);
         return 1;
     }
@@ -44,7 +45,7 @@ int main() {
 
     // Encode text
     float txt_vec[vec_dim];
-    if (!clip_text_encode_c(ctx, n_threads, &tokens, txt_vec)) {
+    if (!clip_text_encode_c(ctx, n_threads, &tokens, txt_vec, true)) {
         fprintf(stderr, "%s: failed to encode text\n", __func__);
         return 1;
     }
diff --git a/examples/python_bindings/clip_cpp/clip.py b/examples/python_bindings/clip_cpp/clip.py
@@ -179,6 +179,7 @@ class ClipContext(ctypes.Structure):
     ctypes.c_int,
     ctypes.POINTER(ClipTokens),
     ctypes.POINTER(ctypes.c_float),
+    ctypes.c_bool,
 ]
 clip_text_encode.restype = ctypes.c_bool
 
@@ -188,6 +189,7 @@ class ClipContext(ctypes.Structure):
     ctypes.c_int,
     ctypes.POINTER(ClipImageF32),
     ctypes.POINTER(ctypes.c_float),
+    ctypes.c_bool,
 ]
 clip_image_encode.restype = ctypes.c_bool
 
@@ -258,22 +260,25 @@ def tokenize(self, text: str) -> List[int]:
         return [tokens.data[i] for i in range(tokens.size)]
 
     def encode_text(
-        self, tokens: List[int], n_threads: int = os.cpu_count()
+        self,
+        tokens: List[int],
+        n_threads: int = os.cpu_count(),
+        normalize: bool = True,
     ) -> List[float]:
         tokens_array = (ClipVocabId * len(tokens))(*tokens)
         clip_tokens = ClipTokens(data=tokens_array, size=len(tokens))
 
         txt_vec = (ctypes.c_float * self.vec_dim)()
 
         if not clip_text_encode(
-            self.ctx, n_threads, ctypes.pointer(clip_tokens), txt_vec
+            self.ctx, n_threads, ctypes.pointer(clip_tokens), txt_vec, normalize
         ):
             raise RuntimeError("Could not encode text")
 
         return [txt_vec[i] for i in range(self.vec_dim)]
 
     def load_preprocess_encode_image(
-        self, image_path: str, n_threads: int = os.cpu_count()
+        self, image_path: str, n_threads: int = os.cpu_count(), normalize: bool = True
     ) -> List[float]:
         image_ptr = make_clip_image_u8()
         if not clip_image_load_from_file(image_path.encode("utf8"), image_ptr):
@@ -284,7 +289,9 @@ def load_preprocess_encode_image(
             raise RuntimeError("Could not preprocess image")
 
         img_vec = (ctypes.c_float * self.vec_dim)()
-        if not clip_image_encode(self.ctx, n_threads, processed_image_ptr, img_vec):
+        if not clip_image_encode(
+            self.ctx, n_threads, processed_image_ptr, img_vec, normalize
+        ):
             raise RuntimeError("Could not encode image")
 
         return [img_vec[i] for i in range(self.vec_dim)]
diff --git a/examples/zsl.cpp b/examples/zsl.cpp
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
     clip_image_preprocess(ctx, &img0, &img_res);
 
     float img_vec[vec_dim];
-    if (!clip_image_encode(ctx, params.n_threads, img_res, img_vec)) {
+    if (!clip_image_encode(ctx, params.n_threads, img_res, img_vec, false)) {
         return 1;
     }
 
@@ -45,7 +45,7 @@ int main(int argc, char ** argv) {
 
     for (int i = 0; i < n_labels; i++) {
         auto tokens = clip_tokenize(ctx, params.texts[i]);
-        clip_text_encode(ctx, params.n_threads, tokens, txt_vec);
+        clip_text_encode(ctx, params.n_threads, tokens, txt_vec, false);
         similarities[i] = clip_similarity_score(img_vec, txt_vec, vec_dim);
     }
 
@@ -63,4 +63,4 @@ int main(int argc, char ** argv) {
     clip_free(ctx);
 
     return 0;
-}
+}
diff --git a/tests/benchmark.cpp b/tests/benchmark.cpp
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
 
     for (const auto & entry : result) {
         auto tokens = clip_tokenize(ctx, entry.first);
-        if (!clip_text_encode(ctx, n_threads, tokens, txt_vecs + label_idx * vec_dim)) {
+        if (!clip_text_encode(ctx, n_threads, tokens, txt_vecs + label_idx * vec_dim, true)) {
             printf("%s: Could not encode the label at index %d: %s\n", __func__, label_idx, entry.first.c_str());
             return 1;
         }
@@ -111,7 +111,7 @@ int main(int argc, char ** argv) {
 
             clip_image_batch_preprocess(ctx, n_threads, img_inputs, imgs_resized);
 
-            clip_image_batch_encode(ctx, n_threads, imgs_resized, img_vecs);
+            clip_image_batch_encode(ctx, n_threads, imgs_resized, img_vecs, true);
 
             for (size_t b = 0; b < batch_size; b++) {
                 for (size_t j = 0; j < n_labels; j++) {