Skip to content

Commit f6f590b

Browse files
committed
Refactor encoding functions to make normalization optional
1 parent f6a4789 commit f6f590b

File tree

8 files changed

+64
-56
lines changed

8 files changed

+64
-56
lines changed

clip.cpp

Lines changed: 24 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
#define STB_IMAGE_IMPLEMENTATION
1414
#include "stb_image.h"
1515

16-
// #define CLIP_DEBUG
16+
#define CLIP_DEBUG
1717

1818
// utility function for a workaround until https://github.com/ggerganov/ggml/issues/260 is resolved
1919
// after that, remove this and use the mechanism implemented in GGML directly
@@ -830,12 +830,13 @@ void clip_free(clip_ctx * ctx) {
830830
delete ctx;
831831
}
832832

833-
bool clip_text_encode_c(const clip_ctx * ctx, int n_threads, const clip_tokens * tokens, float * vec) {
833+
bool clip_text_encode_c(const clip_ctx * ctx, int n_threads, const clip_tokens * tokens, float * vec, const bool normalize) {
834834
std::vector<int> _tokens(tokens->data, tokens->data + tokens->size);
835-
return clip_text_encode(ctx, n_threads, _tokens, vec);
835+
return clip_text_encode(ctx, n_threads, _tokens, vec, normalize);
836836
}
837837

838-
bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_vocab::id> & tokens, float * vec) {
838+
bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_vocab::id> & tokens, float * vec,
839+
const bool normalize) {
839840
const auto & model = ctx->text_model;
840841
const auto & hparams = model.hparams;
841842
const int N = tokens.size();
@@ -975,8 +976,10 @@ bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector<cli
975976
embeddings = ggml_mul_mat(ctx0, model.projection, embeddings);
976977

977978
// normalize output embeddings
978-
ggml_tensor * length = ggml_sqrt(ctx0, ggml_sum(ctx0, ggml_sqr(ctx0, embeddings)));
979-
embeddings = ggml_scale_inplace(ctx0, embeddings, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length));
979+
if (normalize) {
980+
ggml_tensor * length = ggml_sqrt(ctx0, ggml_sum(ctx0, ggml_sqr(ctx0, embeddings)));
981+
embeddings = ggml_scale_inplace(ctx0, embeddings, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length));
982+
}
980983

981984
ggml_set_name(embeddings, "check");
982985

@@ -1045,17 +1048,18 @@ bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector<cli
10451048
return true;
10461049
}
10471050

1048-
bool clip_image_encode_c(const clip_ctx * ctx, int n_threads, const clip_image_f32 * img, float * vec) {
1049-
return clip_image_encode(ctx, n_threads, *img, vec);
1051+
bool clip_image_encode_c(const clip_ctx * ctx, int n_threads, const clip_image_f32 * img, float * vec, const bool normalize) {
1052+
return clip_image_encode(ctx, n_threads, *img, vec, normalize);
10501053
}
10511054

1052-
bool clip_image_encode(const clip_ctx * ctx, int n_threads, const clip_image_f32 & img, float * vec) {
1055+
bool clip_image_encode(const clip_ctx * ctx, int n_threads, const clip_image_f32 & img, float * vec, const bool normalize) {
10531056
std::vector<clip_image_f32> imgs;
10541057
imgs.push_back(img);
1055-
return clip_image_batch_encode(ctx, n_threads, imgs, vec);
1058+
return clip_image_batch_encode(ctx, n_threads, imgs, vec, normalize);
10561059
}
10571060

1058-
bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_image_f32> & imgs, float * vec) {
1061+
bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_image_f32> & imgs, float * vec,
1062+
const bool normalize) {
10591063
const auto & model = ctx->vision_model;
10601064
const auto & hparams = model.hparams;
10611065

@@ -1251,8 +1255,10 @@ bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vec
12511255

12521256
for (int b = 0; b < batch_size; b++) {
12531257
struct ggml_tensor * embedding = ggml_get_rows(ctx0, embeddings, ggml_new_i32(ctx0, b));
1254-
ggml_tensor * length = ggml_sqrt(ctx0, ggml_sum(ctx0, ggml_sqr(ctx0, embedding)));
1255-
embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length));
1258+
if (normalize) {
1259+
ggml_tensor * length = ggml_sqrt(ctx0, ggml_sum(ctx0, ggml_sqr(ctx0, embedding)));
1260+
embedding = ggml_scale_inplace(ctx0, embedding, ggml_div(ctx0, ggml_new_f32(ctx0, 1.0f), length));
1261+
}
12561262
output = ggml_acc(ctx0, output, embedding, output->nb[1], output->nb[2], output->nb[3], b * ggml_nbytes(embedding));
12571263
}
12581264
ggml_set_name(output, "check");
@@ -1330,10 +1336,7 @@ float clip_similarity_score(float * vec1, float * vec2, int vec_dim) {
13301336
dot_product += vec1[i] * vec2[i];
13311337
}
13321338

1333-
// Clamp the dot product to the range [0, 1].
1334-
float clamped_dot_product = fmin(fmax(dot_product, 0.0), 1.0);
1335-
1336-
return clamped_dot_product;
1339+
return dot_product;
13371340
}
13381341

13391342
bool clip_compare_text_and_image_c(clip_ctx * ctx, int n_threads, char * text, clip_image_u8 * image, float * score) {
@@ -1354,14 +1357,14 @@ bool clip_compare_text_and_image(clip_ctx * ctx, int n_threads, std::string & te
13541357
return false;
13551358
}
13561359

1357-
if (!clip_image_encode(ctx, n_threads, img_res, img_vec)) {
1360+
if (!clip_image_encode(ctx, n_threads, img_res, img_vec, true)) {
13581361
return false;
13591362
}
13601363

13611364
// tokenize and encode text
13621365
auto tokens = clip_tokenize(ctx, text);
13631366

1364-
if (!clip_text_encode(ctx, n_threads, tokens, txt_vec)) {
1367+
if (!clip_text_encode(ctx, n_threads, tokens, txt_vec, true)) {
13651368
return false;
13661369
}
13671370

@@ -1396,18 +1399,10 @@ bool softmax_with_sorting(float * arr, int length, float * sorted_scores, int *
13961399
}
13971400

13981401
// Calculate softmax probabilities
1399-
/*
1400-
float max_val = arr[0];
1401-
for (int i = 1; i < length; i++) {
1402-
if (arr[i] > max_val) {
1403-
max_val = arr[i];
1404-
}
1405-
}
1406-
*/
14071402

1408-
float sum = 0.0;
1403+
double sum = 0.0;
14091404
for (int i = 0; i < length; i++) {
1410-
arr[i] = exp(arr[i]);
1405+
arr[i] = exp(arr[i]) + 1e-9;
14111406
sum += arr[i];
14121407
}
14131408

clip.h

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -224,8 +224,10 @@ struct clip_image_f32 * make_clip_image_f32();
224224
bool clip_image_load_from_file_c(const char * fname, struct clip_image_u8 * img);
225225
bool clip_image_preprocess(const struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32 * res);
226226

227-
bool clip_text_encode_c(const struct clip_ctx * ctx, int n_threads, const struct clip_tokens * tokens, float * vec);
228-
bool clip_image_encode_c(const struct clip_ctx * ctx, int n_threads, const struct clip_image_f32 * img, float * vec);
227+
bool clip_text_encode_c(const struct clip_ctx * ctx, int n_threads, const struct clip_tokens * tokens, float * vec,
228+
const bool normalize);
229+
bool clip_image_encode_c(const struct clip_ctx * ctx, int n_threads, const struct clip_image_f32 * img, float * vec,
230+
const bool normalize);
229231

230232
// bool image_normalize(clip_image_u8 *img, clip_image_f32 *res);
231233

@@ -241,13 +243,16 @@ std::vector<clip_vocab::id> clip_tokenize(const clip_ctx * ctx, const std::strin
241243

242244
bool clip_image_load_from_file(const std::string & fname, clip_image_u8 & img);
243245

244-
bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_vocab::id> & tokens, float * vec);
245-
bool clip_image_encode(const struct clip_ctx * ctx, int n_threads, const struct clip_image_f32 & img, float * vec);
246+
bool clip_text_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_vocab::id> & tokens, float * vec,
247+
const bool normalize);
248+
bool clip_image_encode(const struct clip_ctx * ctx, int n_threads, const struct clip_image_f32 & img, float * vec,
249+
const bool normalize);
246250

247251
bool clip_compare_text_and_image(clip_ctx * ctx, int n_threads, std::string & text, clip_image_u8 & image, float * score);
248252

249253
// TODO clip_image_batch_encode_c
250-
bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_image_f32> & imgs, float * vec);
254+
bool clip_image_batch_encode(const clip_ctx * ctx, int n_threads, const std::vector<clip_image_f32> & imgs, float * vec,
255+
const bool normalize);
251256

252257
// TODO clip_image_batch_preprocess_c
253258
void clip_image_batch_preprocess(const clip_ctx * ctx, const int n_threads, const std::vector<clip_image_u8> & img_inputs,

examples/image-search/build.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ int main(int argc, char ** argv) {
126126

127127
clip_image_batch_preprocess(clip_ctx, params.n_threads, img_inputs, imgs_resized);
128128

129-
clip_image_batch_encode(clip_ctx, params.n_threads, imgs_resized, vec.data());
129+
clip_image_batch_encode(clip_ctx, params.n_threads, imgs_resized, vec.data(), true);
130130

131131
// add image vectors to the database
132132
for (size_t b = 0; b < batch_size; b++) {
@@ -162,7 +162,7 @@ int main(int argc, char ** argv) {
162162
}
163163

164164
clip_image_batch_preprocess(clip_ctx, params.n_threads, img_inputs, imgs_resized);
165-
clip_image_batch_encode(clip_ctx, params.n_threads, imgs_resized, vec.data());
165+
clip_image_batch_encode(clip_ctx, params.n_threads, imgs_resized, vec.data(), true);
166166

167167
// add image vectors to the database
168168
for (size_t l = 0; l < leftover; l++) {

examples/image-search/search.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ int main(int argc, char ** argv) {
142142
clip_image_f32 img_res;
143143
clip_image_preprocess(clip_ctx, &img0, &img_res);
144144

145-
if (!clip_image_encode(clip_ctx, params.n_threads, img_res, vec.data())) {
145+
if (!clip_image_encode(clip_ctx, params.n_threads, img_res, vec.data(), true)) {
146146
fprintf(stderr, "%s: failed to encode image from '%s'\n", __func__, params.img_path.c_str());
147147
clip_free(clip_ctx);
148148
return 1;
@@ -151,7 +151,7 @@ int main(int argc, char ** argv) {
151151

152152
auto tokens = clip_tokenize(clip_ctx, params.search_text);
153153

154-
clip_text_encode(clip_ctx, params.n_threads, tokens, vec.data());
154+
clip_text_encode(clip_ctx, params.n_threads, tokens, vec.data(), true);
155155
}
156156

157157
auto results = embd_index.search({vec.data(), vec.size()}, params.n_results);

examples/main.c

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
#include "clip.h"
2-
#include "stdio.h"
2+
#include <stdbool.h>
3+
#include <stdio.h>
34

45
int main() {
5-
char *model_path = "../../models/openai_clip-vit-base-patch32.ggmlv0.q4_1.bin";
6-
char *img_path = "../../tests/red_apple.jpg";
7-
char *text = "an apple";
6+
char * model_path = "../../models/openai_clip-vit-base-patch32.ggmlv0.q4_1.bin";
7+
char * img_path = "../../tests/red_apple.jpg";
8+
char * text = "an apple";
89
int n_threads = 4;
910
int verbosity = 1;
1011

1112
// Load CLIP model
12-
struct clip_ctx *ctx = clip_model_load(model_path, verbosity);
13+
struct clip_ctx * ctx = clip_model_load(model_path, verbosity);
1314
if (!ctx) {
1415
printf("%s: Unable to load model from %s", __func__, model_path);
1516
return 1;
@@ -19,22 +20,22 @@ int main() {
1920
int vec_dim = 512;
2021

2122
// Load image from disk
22-
struct clip_image_u8 *img0 = make_clip_image_u8();
23+
struct clip_image_u8 * img0 = make_clip_image_u8();
2324
if (!clip_image_load_from_file_c(img_path, img0)) {
2425
fprintf(stderr, "%s: failed to load image from '%s'\n", __func__, img_path);
2526
return 1;
2627
}
2728

2829
// Preprocess image
29-
struct clip_image_f32 *img_res = make_clip_image_f32();
30+
struct clip_image_f32 * img_res = make_clip_image_f32();
3031
if (!clip_image_preprocess(ctx, img0, img_res)) {
3132
fprintf(stderr, "%s: failed to preprocess image\n", __func__);
3233
return 1;
3334
}
3435

3536
// Encode image
3637
float img_vec[vec_dim];
37-
if (!clip_image_encode_c(ctx, n_threads, img_res, img_vec)) {
38+
if (!clip_image_encode_c(ctx, n_threads, img_res, img_vec, true)) {
3839
fprintf(stderr, "%s: failed to encode image\n", __func__);
3940
return 1;
4041
}
@@ -44,7 +45,7 @@ int main() {
4445

4546
// Encode text
4647
float txt_vec[vec_dim];
47-
if (!clip_text_encode_c(ctx, n_threads, &tokens, txt_vec)) {
48+
if (!clip_text_encode_c(ctx, n_threads, &tokens, txt_vec, true)) {
4849
fprintf(stderr, "%s: failed to encode text\n", __func__);
4950
return 1;
5051
}

examples/python_bindings/clip_cpp/clip.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,7 @@ class ClipContext(ctypes.Structure):
179179
ctypes.c_int,
180180
ctypes.POINTER(ClipTokens),
181181
ctypes.POINTER(ctypes.c_float),
182+
ctypes.c_bool,
182183
]
183184
clip_text_encode.restype = ctypes.c_bool
184185

@@ -188,6 +189,7 @@ class ClipContext(ctypes.Structure):
188189
ctypes.c_int,
189190
ctypes.POINTER(ClipImageF32),
190191
ctypes.POINTER(ctypes.c_float),
192+
ctypes.c_bool,
191193
]
192194
clip_image_encode.restype = ctypes.c_bool
193195

@@ -258,22 +260,25 @@ def tokenize(self, text: str) -> List[int]:
258260
return [tokens.data[i] for i in range(tokens.size)]
259261

260262
def encode_text(
261-
self, tokens: List[int], n_threads: int = os.cpu_count()
263+
self,
264+
tokens: List[int],
265+
n_threads: int = os.cpu_count(),
266+
normalize: bool = True,
262267
) -> List[float]:
263268
tokens_array = (ClipVocabId * len(tokens))(*tokens)
264269
clip_tokens = ClipTokens(data=tokens_array, size=len(tokens))
265270

266271
txt_vec = (ctypes.c_float * self.vec_dim)()
267272

268273
if not clip_text_encode(
269-
self.ctx, n_threads, ctypes.pointer(clip_tokens), txt_vec
274+
self.ctx, n_threads, ctypes.pointer(clip_tokens), txt_vec, normalize
270275
):
271276
raise RuntimeError("Could not encode text")
272277

273278
return [txt_vec[i] for i in range(self.vec_dim)]
274279

275280
def load_preprocess_encode_image(
276-
self, image_path: str, n_threads: int = os.cpu_count()
281+
self, image_path: str, n_threads: int = os.cpu_count(), normalize: bool = True
277282
) -> List[float]:
278283
image_ptr = make_clip_image_u8()
279284
if not clip_image_load_from_file(image_path.encode("utf8"), image_ptr):
@@ -284,7 +289,9 @@ def load_preprocess_encode_image(
284289
raise RuntimeError("Could not preprocess image")
285290

286291
img_vec = (ctypes.c_float * self.vec_dim)()
287-
if not clip_image_encode(self.ctx, n_threads, processed_image_ptr, img_vec):
292+
if not clip_image_encode(
293+
self.ctx, n_threads, processed_image_ptr, img_vec, normalize
294+
):
288295
raise RuntimeError("Could not encode image")
289296

290297
return [img_vec[i] for i in range(self.vec_dim)]

examples/zsl.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ int main(int argc, char ** argv) {
3535
clip_image_preprocess(ctx, &img0, &img_res);
3636

3737
float img_vec[vec_dim];
38-
if (!clip_image_encode(ctx, params.n_threads, img_res, img_vec)) {
38+
if (!clip_image_encode(ctx, params.n_threads, img_res, img_vec, false)) {
3939
return 1;
4040
}
4141

@@ -45,7 +45,7 @@ int main(int argc, char ** argv) {
4545

4646
for (int i = 0; i < n_labels; i++) {
4747
auto tokens = clip_tokenize(ctx, params.texts[i]);
48-
clip_text_encode(ctx, params.n_threads, tokens, txt_vec);
48+
clip_text_encode(ctx, params.n_threads, tokens, txt_vec, false);
4949
similarities[i] = clip_similarity_score(img_vec, txt_vec, vec_dim);
5050
}
5151

@@ -63,4 +63,4 @@ int main(int argc, char ** argv) {
6363
clip_free(ctx);
6464

6565
return 0;
66-
}
66+
}

tests/benchmark.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ int main(int argc, char ** argv) {
6464

6565
for (const auto & entry : result) {
6666
auto tokens = clip_tokenize(ctx, entry.first);
67-
if (!clip_text_encode(ctx, n_threads, tokens, txt_vecs + label_idx * vec_dim)) {
67+
if (!clip_text_encode(ctx, n_threads, tokens, txt_vecs + label_idx * vec_dim, true)) {
6868
printf("%s: Could not encode the label at index %d: %s\n", __func__, label_idx, entry.first.c_str());
6969
return 1;
7070
}
@@ -111,7 +111,7 @@ int main(int argc, char ** argv) {
111111

112112
clip_image_batch_preprocess(ctx, n_threads, img_inputs, imgs_resized);
113113

114-
clip_image_batch_encode(ctx, n_threads, imgs_resized, img_vecs);
114+
clip_image_batch_encode(ctx, n_threads, imgs_resized, img_vecs, true);
115115

116116
for (size_t b = 0; b < batch_size; b++) {
117117
for (size_t j = 0; j < n_labels; j++) {

0 commit comments

Comments
 (0)