Skip to content

llava: change llava API to pure C style for Rust FFI bindgen #6079

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Mar 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 18 additions & 18 deletions examples/llava/clip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1235,16 +1235,16 @@ struct clip_image_f32 * clip_image_f32_init() {

void clip_image_u8_free(struct clip_image_u8 * img) { delete img; }
void clip_image_f32_free(struct clip_image_f32 * img) { delete img; }
void clip_image_u8_batch_free(struct clip_image_u8_batch & batch) {
if (batch.size > 0) {
delete[] batch.data;
batch.size = 0;
void clip_image_u8_batch_free(struct clip_image_u8_batch * batch) {
if (batch->size > 0) {
delete[] batch->data;
batch->size = 0;
}
}
void clip_image_f32_batch_free(struct clip_image_f32_batch & batch) {
if (batch.size > 0) {
delete[] batch.data;
batch.size = 0;
void clip_image_f32_batch_free(struct clip_image_f32_batch * batch) {
if (batch->size > 0) {
delete[] batch->data;
batch->size = 0;
}
}

Expand Down Expand Up @@ -1497,7 +1497,7 @@ static std::vector<clip_image_u8*> divide_to_patches_u8(const clip_image_u8 & im

// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
// res_imgs memory is being allocated here, previous allocations will be freed if found
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs) {
bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch * res_imgs) {
bool pad_to_square = true;
if (!ctx->has_vision_encoder) {
printf("This gguf file seems to have no vision encoder\n");
Expand All @@ -1509,11 +1509,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
pad_to_square = false;
}
// free the previous res_imgs if any set
if (res_imgs.size > 0) {
if (res_imgs->size > 0) {
clip_image_f32_batch_free(res_imgs);
}
res_imgs.data = nullptr;
res_imgs.size = 0;
res_imgs->data = nullptr;
res_imgs->size = 0;

// the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
// see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
Expand Down Expand Up @@ -1568,11 +1568,11 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
bicubic_resize(*img, *image_original_resize, params.image_size, params.image_size); // in python this is "shortest_edge", but all CLIP are square
patches.insert(patches.begin(), image_original_resize);
// clip_image_f32_batch_init(patches.size());
res_imgs.size = patches.size();
res_imgs.data = new clip_image_f32[res_imgs.size];
res_imgs->size = patches.size();
res_imgs->data = new clip_image_f32[res_imgs->size];
int num=0;
for (auto& patch : patches) {
normalize_image_u8_to_f32(patch, &res_imgs.data[num], ctx->image_mean, ctx->image_std);
normalize_image_u8_to_f32(patch, &res_imgs->data[num], ctx->image_mean, ctx->image_std);
num++;
}

Expand Down Expand Up @@ -1660,9 +1660,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, cli
// }
// res_imgs.push_back(res);

res_imgs.size = 1;
res_imgs.data = new clip_image_f32[res_imgs.size];
res_imgs.data[0] = *res;
res_imgs->size = 1;
res_imgs->data = new clip_image_f32[res_imgs->size];
res_imgs->data[0] = *res;
clip_image_f32_free(res);

return true;
Expand Down
6 changes: 3 additions & 3 deletions examples/llava/clip.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,16 +60,16 @@ CLIP_API struct clip_image_f32 * clip_image_f32_init();

CLIP_API void clip_image_u8_free (struct clip_image_u8 * img);
CLIP_API void clip_image_f32_free(struct clip_image_f32 * img);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch & batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch & batch);
CLIP_API void clip_image_u8_batch_free (struct clip_image_u8_batch * batch);
CLIP_API void clip_image_f32_batch_free(struct clip_image_f32_batch * batch);

CLIP_API bool clip_image_load_from_file(const char * fname, struct clip_image_u8 * img);

/** interpret bytes as an image file with length bytes_length, and use the result to populate img */
CLIP_API bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img);

/** preprocess img and store the result in res_imgs, pad_to_square may be overriden to false depending on model configuration */
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, clip_image_f32_batch & res_imgs );
CLIP_API bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );

CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);

Expand Down
2 changes: 1 addition & 1 deletion examples/llava/llava.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
clip_image_f32_batch img_res_v;
img_res_v.size = 0;
img_res_v.data = nullptr;
if (!clip_image_preprocess(ctx_clip, img, img_res_v)) {
if (!clip_image_preprocess(ctx_clip, img, &img_res_v)) {
fprintf(stderr, "%s: unable to preprocess image\n", __func__);
delete[] img_res_v.data;
return false;
Expand Down
4 changes: 2 additions & 2 deletions examples/llava/llava.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ struct llava_image_embed {
};

/** sanity check for clip <-> llava embed size match */
LLAVA_API bool llava_validate_embed_size(const llama_context * ctx_llama, const clip_ctx * ctx_clip);
LLAVA_API bool llava_validate_embed_size(const struct llama_context * ctx_llama, const struct clip_ctx * ctx_clip);

LLAVA_API bool llava_image_embed_make_with_clip_img(clip_ctx * ctx_clip, int n_threads, const clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);
LLAVA_API bool llava_image_embed_make_with_clip_img(struct clip_ctx * ctx_clip, int n_threads, const struct clip_image_u8 * img, float ** image_embd_out, int * n_img_pos_out);

/** build an image embed from image file bytes */
LLAVA_API struct llava_image_embed * llava_image_embed_make_with_bytes(struct clip_ctx * ctx_clip, int n_threads, const unsigned char * image_bytes, int image_bytes_length);
Expand Down