Skip to content

Commit fc008b3

Browse files
authored
Revert "clip : clip.h become private API (⚠️ breaking change) (ggml-org#13510)"
This reverts commit 71bdbdb.
1 parent 7c07ac2 commit fc008b3

File tree

3 files changed

+738
-45
lines changed

3 files changed

+738
-45
lines changed

tools/mtmd/clip.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2309,6 +2309,14 @@ struct clip_model_loader {
23092309
}
23102310
};
23112311

2312+
// read and create ggml_context containing the tensors and their data
2313+
struct clip_ctx * clip_model_load(const char * fname, const int verbosity) {
2314+
return clip_init(fname, clip_context_params{
2315+
/* use_gpu */ true,
2316+
/* verbosity */ static_cast<ggml_log_level>(verbosity),
2317+
});
2318+
}
2319+
23122320
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params) {
23132321
g_logger_state.verbosity_thold = ctx_params.verbosity;
23142322
clip_ctx * ctx_clip = nullptr;
@@ -3077,6 +3085,19 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
30773085
return ctx->vision_model.hparams.image_grid_pinpoints.size();
30783086
}
30793087

3088+
// deprecated
3089+
int clip_n_patches(const struct clip_ctx * ctx) {
3090+
clip_image_f32 img;
3091+
img.nx = ctx->vision_model.hparams.image_size;
3092+
img.ny = ctx->vision_model.hparams.image_size;
3093+
return clip_n_output_tokens(ctx, &img);
3094+
}
3095+
3096+
// deprecated
3097+
int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
3098+
return clip_n_output_tokens(ctx, img);
3099+
}
3100+
30803101
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
30813102
const auto & params = ctx->vision_model.hparams;
30823103
const int n_total = clip_n_output_tokens(ctx, img);

tools/mtmd/clip.h

Lines changed: 81 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,28 @@
1-
#pragma once
1+
#ifndef CLIP_H
2+
#define CLIP_H
23

34
#include "ggml.h"
45
#include <stddef.h>
56
#include <stdint.h>
67

8+
// CLIP_API decorates the public clip functions declared in this header.
//  - LLAMA_SHARED on Windows (non-MinGW): dllexport when LLAMA_BUILD is defined, dllimport otherwise.
//  - LLAMA_SHARED on other toolchains: default symbol visibility.
//  - LLAMA_SHARED not defined: expands to nothing.
#ifdef LLAMA_SHARED
9+
# if defined(_WIN32) && !defined(__MINGW32__)
10+
# ifdef LLAMA_BUILD
11+
# define CLIP_API __declspec(dllexport)
12+
# else
13+
# define CLIP_API __declspec(dllimport)
14+
# endif
15+
# else
16+
# define CLIP_API __attribute__ ((visibility ("default")))
17+
# endif
18+
#else
19+
# define CLIP_API
20+
#endif
21+
22+
#ifdef __cplusplus
23+
extern "C" {
24+
#endif
25+
726
struct clip_ctx;
827

928
struct clip_image_size {
@@ -20,80 +39,97 @@ struct clip_context_params {
2039
enum ggml_log_level verbosity;
2140
};
2241

23-
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
42+
// deprecated, use clip_init
43+
CLIP_API struct clip_ctx * clip_model_load(const char * fname, int verbosity);
44+
45+
CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
2446

25-
void clip_free(struct clip_ctx * ctx);
47+
CLIP_API void clip_free(struct clip_ctx * ctx);
2648

27-
size_t clip_embd_nbytes(const struct clip_ctx * ctx);
28-
size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
49+
CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
50+
CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
2951

30-
int32_t clip_get_image_size (const struct clip_ctx * ctx);
31-
int32_t clip_get_patch_size (const struct clip_ctx * ctx);
32-
int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
52+
CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
53+
CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
54+
CLIP_API int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
3355

3456
// TODO: should be enum, not string
35-
const char * clip_patch_merge_type(const struct clip_ctx * ctx);
57+
CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
3658

37-
const int32_t * clip_image_grid(const struct clip_ctx * ctx);
38-
size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
59+
CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
60+
CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
3961

40-
int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
62+
GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
63+
"use clip_n_output_tokens instead");
64+
GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
65+
"use clip_n_output_tokens instead");
66+
67+
CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
4168

4269
// for M-RoPE, this will be the number of token positions in X and Y directions
4370
// for other models, X will be the total number of tokens and Y will be 1
44-
int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
45-
int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
71+
CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
72+
CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
4673

4774
// this should be equal to the embedding dimension of the text model
48-
int clip_n_mmproj_embd(const struct clip_ctx * ctx);
75+
CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
4976

50-
int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
51-
void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
52-
struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
77+
CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
78+
CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
79+
CLIP_API struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip);
5380

54-
struct clip_image_size * clip_image_size_init(void);
55-
struct clip_image_u8 * clip_image_u8_init (void);
56-
struct clip_image_f32 * clip_image_f32_init(void);
57-
struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
81+
CLIP_API struct clip_image_size * clip_image_size_init(void);
82+
CLIP_API struct clip_image_u8 * clip_image_u8_init (void);
83+
CLIP_API struct clip_image_f32 * clip_image_f32_init(void);
84+
CLIP_API struct clip_image_f32_batch * clip_image_f32_batch_init(void); // only used by libllava
5885

5986
// nx, ny are the output image dimensions
60-
unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
87+
CLIP_API unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny);
6188

62-
void clip_image_size_free (struct clip_image_size * img_size);
63-
void clip_image_u8_free (struct clip_image_u8 * img);
64-
void clip_image_f32_free(struct clip_image_f32 * img);
65-
void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
66-
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
89+
CLIP_API void clip_image_size_free (struct clip_image_size * img_size);
90+
CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
91+
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
92+
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
93+
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);
6794

6895
// used for accessing the underlying data of clip_image_f32_batch
69-
size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
70-
size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
71-
size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
72-
struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
96+
CLIP_API size_t clip_image_f32_batch_n_images(const struct clip_image_f32_batch * batch); // equivalent to batch->size()
97+
CLIP_API size_t clip_image_f32_batch_nx(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->nx
98+
CLIP_API size_t clip_image_f32_batch_ny(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->ny
99+
CLIP_API struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch * batch, int idx); // equivalent to batch[idx]->data
73100

74101
/**
75102
* Build image from pixels decoded by other libraries instead of stb_image.h for better performance.
76103
* The memory layout is RGBRGBRGB..., input buffer length must be 3*nx*ny bytes
77104
*/
78-
void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
105+
CLIP_API void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
79106

80-
bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
107+
CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);
81108

82109
/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
83-
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
110+
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);
84111

85112
/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
86-
bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
113+
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
114+
115+
CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
116+
117+
CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
118+
CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
119+
120+
CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
121+
122+
CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
123+
CLIP_API bool clip_is_glm(const struct clip_ctx * ctx);
124+
CLIP_API bool clip_is_qwen2vl(const struct clip_ctx * ctx);
125+
CLIP_API bool clip_is_llava(const struct clip_ctx * ctx);
126+
CLIP_API bool clip_is_gemma3(const struct clip_ctx * ctx);
87127

88-
struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
128+
CLIP_API bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
89129

90-
bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
91-
bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
92130

93-
int clip_is_minicpmv(const struct clip_ctx * ctx);
94-
bool clip_is_glm(const struct clip_ctx * ctx);
95-
bool clip_is_qwen2vl(const struct clip_ctx * ctx);
96-
bool clip_is_llava(const struct clip_ctx * ctx);
97-
bool clip_is_gemma3(const struct clip_ctx * ctx);
131+
#ifdef __cplusplus
132+
}
133+
#endif
98134

99-
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
135+
#endif // CLIP_H

0 commit comments

Comments
 (0)