|
@@ -40,6 +40,7 @@
 #include <map>
 #include <regex>
 #include <stdexcept>
+#include <unordered_set>
 #include <vector>
 #include <sstream>
 #include <cinttypes>
|
@@ -447,7 +448,7 @@ struct clip_hparams {
 
     std::vector<int32_t> image_grid_pinpoints;
     int32_t image_crop_resolution;
-    std::vector<int32_t> vision_feature_layer;
+    std::unordered_set<int32_t> vision_feature_layer;
 };
 
 struct clip_layer {
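
Changing vision_feature_layer from std::vector<int32_t> to std::unordered_set<int32_t> turns every per-layer membership test below into an average O(1) hash lookup instead of a linear scan. A minimal standalone sketch of the pattern (names and values are illustrative, not code from this patch):

    #include <cstdint>
    #include <cstdio>
    #include <unordered_set>

    int main() {
        // hypothetical feature-layer indices for a 12-layer encoder
        const std::unordered_set<int32_t> feature_layers = {3, 7, 11};

        for (int32_t il = 0; il < 12; ++il) {
            // average O(1) membership test; replaces the old loop over a vector
            if (feature_layers.find(il) != feature_layers.end()) {
                std::printf("layer %d is a feature layer\n", il);
            }
        }
        return 0;
    }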
|
@@ -756,18 +757,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     std::vector<struct ggml_tensor *> embedding_stack;
+    const auto & vision_feature_layer = hparams.vision_feature_layer;
 
     // loop over layers
     for (int il = 0; il < ctx->max_feature_layer; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
 
         // If this is an embedding feature layer, save the output.
         // NOTE: 0 index here refers to the input to the encoder.
-        for (size_t vl_idx = 0; vl_idx < hparams.vision_feature_layer.size(); vl_idx++) {
-            if (il == ctx->vision_model.hparams.vision_feature_layer[vl_idx]) {
-                embedding_stack.push_back(embeddings);
-                break;
-            }
+        if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
+            embedding_stack.push_back(embeddings);
         }
 
         //const size_t nb_q_w = model.layers[il].q_w->nb[0];
|
@@ -868,11 +867,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // final layer is a vision feature layer
-    for (size_t vl_idx = 0; vl_idx < hparams.vision_feature_layer.size(); vl_idx++) {
-        if (n_layer == ctx->vision_model.hparams.vision_feature_layer[vl_idx]) {
-            embedding_stack.push_back(embeddings);
-            break;
-        }
+    if (vision_feature_layer.find(n_layer) != vision_feature_layer.end()) {
+        embedding_stack.push_back(embeddings);
     }
 
     // If feature layers are explicitly set, stack them (if we have multiple)
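
The const auto & vision_feature_layer alias hoisted above keeps this check and the in-loop one from re-spelling ctx->vision_model.hparams.vision_feature_layer. Also, find() != end() is the portable C++17 spelling of a membership test; under C++20 the same check could read vision_feature_layer.contains(il). A compilable sketch of the check's shape (hypothetical helper name, not part of the patch):

    #include <cstdint>
    #include <unordered_set>

    // true when layer il should be pushed onto the embedding stack
    static bool is_feature_layer(const std::unordered_set<int32_t> & layers, int32_t il) {
        return layers.find(il) != layers.end(); // C++20: layers.contains(il)
    }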
|
@@ -1486,7 +1482,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
             const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
 
             for (int i = 0; i < n; ++i) {
-                hparams.vision_feature_layer.push_back(vision_feature_layer[i]);
+                hparams.vision_feature_layer.insert(vision_feature_layer[i]);
             }
         } catch (std::runtime_error & /*e*/) { }
 
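
One behavioral nuance of the container change: insert() on an unordered_set silently collapses duplicate layer indices, whereas push_back() on the old vector kept them (the break in the old lookup loops merely hid the duplication). A standalone sketch, assuming a model file could carry duplicates:

    #include <cassert>
    #include <cstdint>
    #include <unordered_set>

    int main() {
        const int32_t raw[] = {3, 7, 7, 11};   // duplicated 7, as a file might carry
        std::unordered_set<int32_t> layers;
        for (const int32_t v : raw) {
            layers.insert(v);                  // the second 7 is a no-op
        }
        assert(layers.size() == 3);
        return 0;
    }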
|
@@ -1530,13 +1526,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
         LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
         LOG_INF("v_image_grid_pinpoints: ");
-        for (size_t i = 0; i < hparams.image_grid_pinpoints.size(); ++i) {
-            LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
+        for (const auto & pp : hparams.image_grid_pinpoints) {
+            LOG_INF("%d ", pp);
         }
         LOG_INF("\n");
         LOG_INF("v_vision_feature_layer: ");
-        for (size_t i = 0; i < hparams.vision_feature_layer.size(); i++) {
-            LOG_INF("%d ", hparams.vision_feature_layer[i]);
+        for (const auto & feature_layer: hparams.vision_feature_layer) {
+            LOG_INF("%d ", feature_layer);
         }
         LOG_INF("\n");
         LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
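
Unlike the old index loop over a vector, iterating an unordered_set has no guaranteed order, so the v_vision_feature_layer values may now be logged in arbitrary order. That is cosmetic here: everywhere else the set is only queried for membership or reduced to its maximum, both of which are order-independent.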
|
@@ -2997,9 +2993,9 @@ int get_deepest_feature_layer(const struct clip_ctx * ctx) {
     }
 
     // If we set explicit vision feature layers, only go up to the deepest one
-    for (size_t i = 0; i < hparams.vision_feature_layer.size(); i++) {
-        if (hparams.vision_feature_layer[i] > deepest_feature_layer) {
-            deepest_feature_layer = hparams.vision_feature_layer[i];
+    for (const auto & feature_layer: hparams.vision_feature_layer) {
+        if (feature_layer > deepest_feature_layer) {
+            deepest_feature_layer = feature_layer;
         }
     }
     return deepest_feature_layer < 0 ? n_layer: deepest_feature_layer;
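
Since the maximum of a set does not depend on iteration order, this rewritten loop stays correct over an unordered_set. An equivalent standard-library formulation, shown only as a sketch (assuming deepest_feature_layer is initialized to a negative sentinel, as the final < 0 check suggests):

    #include <algorithm>
    #include <cstdint>
    #include <unordered_set>

    // sketch: order-independent maximum over the feature-layer set
    static int32_t deepest_feature_layer_of(const std::unordered_set<int32_t> & layers, int32_t n_layer) {
        if (layers.empty()) {
            return n_layer;    // no explicit layers: fall back to n_layer
        }
        const int32_t d = *std::max_element(layers.begin(), layers.end());
        return d < 0 ? n_layer : d;    // a negative maximum also falls back
    }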
|
|