@@ -586,7 +586,6 @@ static ggml_tensor * build_rope_2d(
586
586
ggml_row_size (cur->type , n_dim),
587
587
ggml_row_size (cur->type , n_dim*n_head),
588
588
0 );
589
- // first = ggml_cont(ctx0, first);
590
589
first = ggml_rope_ext (
591
590
ctx0,
592
591
first,
@@ -599,7 +598,7 @@ static ggml_tensor * build_rope_2d(
599
598
}
600
599
601
600
// second half (write to tmp)
602
- ggml_tensor * second = cur ;
601
+ ggml_tensor * second;
603
602
{
604
603
second = ggml_view_3d (ctx0, cur,
605
604
n_dim/2 , n_head, n_pos,
@@ -2825,9 +2824,20 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
2825
2824
2826
2825
{
2827
2826
struct ggml_tensor * inp_raw = ggml_graph_get_tensor (gf, " inp_raw" );
2828
- float * data = (float *)malloc (ggml_nbytes (inp_raw));
2827
+ std::vector<float > inp_data (ggml_nelements (inp_raw));
2828
+ float * data = inp_data.data ();
2829
+
2830
+ // layout of data (note: the channel dim is unrolled to better visualize the layout):
2831
+ //
2832
+ // ┌──W──┐
2833
+ // │ H │ channel = R
2834
+ // ├─────┤ │
2835
+ // │ H │ channel = G
2836
+ // ├─────┤ │
2837
+ // │ H │ channel = B
2838
+ // └─────┘ │
2839
+ // ──────┘ x B
2829
2840
2830
- // TODO @ngxson : this whole code block is ugly, will need to be refactored
2831
2841
for (size_t i = 0 ; i < imgs.entries .size (); i++) {
2832
2842
const int nx = imgs.entries [i]->nx ;
2833
2843
const int ny = imgs.entries [i]->ny ;
@@ -2842,17 +2852,19 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
2842
2852
const int n = nx * ny;
2843
2853
2844
2854
for (int b = 0 ; b < batch_size; b++) {
2845
- for (int k = 0 ; k < 3 ; k++) {
2846
- for (int y = 0 ; y < ny; y++) {
2847
- for (int x = 0 ; x < nx; x++) {
2848
- data[(b * 3 * n) + k * n + y * nx + x] = imgs.entries [b]->buf [3 * (y * nx + x) + k];
2849
- }
2855
+ float * batch_entry = data + b * (3 *n);
2856
+ for (int y = 0 ; y < ny; y++) {
2857
+ for (int x = 0 ; x < nx; x++) {
2858
+ size_t base_src = 3 *(y * nx + x); // idx of the first channel
2859
+ size_t base_dst = y * nx + x; // idx of the first channel
2860
+ batch_entry[ base_dst] = imgs.entries [b]->buf [base_src ];
2861
+ batch_entry[1 *n + base_dst] = imgs.entries [b]->buf [base_src + 1 ];
2862
+ batch_entry[2 *n + base_dst] = imgs.entries [b]->buf [base_src + 2 ];
2850
2863
}
2851
2864
}
2852
2865
}
2853
2866
}
2854
2867
ggml_backend_tensor_set (inp_raw, data, 0 , ggml_nbytes (inp_raw));
2855
- free (data);
2856
2868
}
2857
2869
if (ctx->has_minicpmv_projector ) {
2858
2870
{
0 commit comments