Skip to content

Commit 8676316

Browse files
Use unordered set to store feature layers
Co-authored-by: Xuan-Son Nguyen <[email protected]>
Signed-off-by: Alex-Brooks <[email protected]>
1 parent 188bfb0 commit 8676316

File tree

1 file changed

+15
-19
lines changed

1 file changed

+15
-19
lines changed

examples/llava/clip.cpp

Lines changed: 15 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@
4040
#include <map>
4141
#include <regex>
4242
#include <stdexcept>
43+
#include <unordered_set>
4344
#include <vector>
4445
#include <sstream>
4546
#include <cinttypes>
@@ -447,7 +448,7 @@ struct clip_hparams {
447448

448449
std::vector<int32_t> image_grid_pinpoints;
449450
int32_t image_crop_resolution;
450-
std::vector<int32_t> vision_feature_layer;
451+
std::unordered_set<int32_t> vision_feature_layer;
451452
};
452453

453454
struct clip_layer {
@@ -756,18 +757,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
756757
}
757758

758759
std::vector<struct ggml_tensor *> embedding_stack;
760+
const auto & vision_feature_layer = hparams.vision_feature_layer;
759761

760762
// loop over layers
761763
for (int il = 0; il < ctx->max_feature_layer; il++) {
762764
struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
763765

764766
// If this is an embedding feature layer, save the output.
765767
// NOTE: 0 index here refers to the input to the encoder.
766-
for (size_t vl_idx = 0; vl_idx < hparams.vision_feature_layer.size(); vl_idx++) {
767-
if (il == ctx->vision_model.hparams.vision_feature_layer[vl_idx]) {
768-
embedding_stack.push_back(embeddings);
769-
break;
770-
}
768+
if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
769+
embedding_stack.push_back(embeddings);
771770
}
772771

773772
//const size_t nb_q_w = model.layers[il].q_w->nb[0];
@@ -868,11 +867,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
868867
}
869868

870869
// final layer is a vision feature layer
871-
for (size_t vl_idx = 0; vl_idx < hparams.vision_feature_layer.size(); vl_idx++) {
872-
if (n_layer == ctx->vision_model.hparams.vision_feature_layer[vl_idx]) {
873-
embedding_stack.push_back(embeddings);
874-
break;
875-
}
870+
if (vision_feature_layer.find(n_layer) != vision_feature_layer.end()) {
871+
embedding_stack.push_back(embeddings);
876872
}
877873

878874
// If feature layers are explicitly set, stack them (if we have multiple)
@@ -1486,7 +1482,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
14861482
const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
14871483

14881484
for (int i = 0; i < n; ++i) {
1489-
hparams.vision_feature_layer.push_back(vision_feature_layer[i]);
1485+
hparams.vision_feature_layer.insert(vision_feature_layer[i]);
14901486
}
14911487
} catch (std::runtime_error & /*e*/) { }
14921488

@@ -1530,13 +1526,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
15301526
LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
15311527
LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
15321528
LOG_INF("v_image_grid_pinpoints: ");
1533-
for (size_t i = 0; i < hparams.image_grid_pinpoints.size(); ++i) {
1534-
LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
1529+
for (const auto & pp : hparams.image_grid_pinpoints) {
1530+
LOG_INF("%d ", pp);
15351531
}
15361532
LOG_INF("\n");
15371533
LOG_INF("v_vision_feature_layer: ");
1538-
for (size_t i = 0; i < hparams.vision_feature_layer.size(); i++) {
1539-
LOG_INF("%d ", hparams.vision_feature_layer[i]);
1534+
for (const auto & feature_layer: hparams.vision_feature_layer) {
1535+
LOG_INF("%d ", feature_layer);
15401536
}
15411537
LOG_INF("\n");
15421538
LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
@@ -2997,9 +2993,9 @@ int get_deepest_feature_layer(const struct clip_ctx * ctx) {
29972993
}
29982994

29992995
// If we set explicit vision feature layers, only go up to the deepest one
3000-
for (size_t i = 0; i < hparams.vision_feature_layer.size(); i++) {
3001-
if (hparams.vision_feature_layer[i] > deepest_feature_layer) {
3002-
deepest_feature_layer = hparams.vision_feature_layer[i];
2996+
for (const auto & feature_layer: hparams.vision_feature_layer) {
2997+
if (feature_layer > deepest_feature_layer) {
2998+
deepest_feature_layer = feature_layer;
30032999
}
30043000
}
30053001
return deepest_feature_layer < 0 ? n_layer: deepest_feature_layer;

0 commit comments

Comments (0)