@@ -486,280 +486,6 @@ static void debug_test_mrope_2d() {
     ggml_backend_free(backend);
 }
 
-static void debug_patch_layout() {
-    // 1. Initialize backend
-    ggml_backend_t backend = NULL;
-    std::string backend_name = "";
-// #ifdef GGML_USE_CUDA
-//     fprintf(stderr, "%s: using CUDA backend\n", __func__);
-//     backend = ggml_backend_cuda_init(0); // init device 0
-//     backend_name = "cuda";
-//     if (!backend) {
-//         fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-//     }
-// #endif
-    // if there aren't GPU Backends fallback to CPU backend
-    if (!backend) {
-        backend = ggml_backend_cpu_init();
-        backend_name = "cpu";
-    }
-
-    // Calculate the size needed to allocate
-    size_t ctx_size = 0;
-    ctx_size += 2 * ggml_tensor_overhead(); // tensors
-    // no need to allocate anything else!
-
-    // 2. Allocate `ggml_context` to store tensor data
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
-    };
-    struct ggml_context * ctx = ggml_init(params);
-
-    const int patches_w = 14;
-    const int patches_h = 10;
-    const int c = 2;
-    const int batch_size = 1;
-    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, patches_w, patches_h, c, batch_size);
-    ggml_set_name(inp_raw, "inp_raw");
-    ggml_set_input(inp_raw);
-
-
-    std::vector<float> dummy_q;
-    dummy_q.resize(patches_w * patches_h * c * batch_size);
-    for (size_t i = 0; i < patches_h * patches_w * c; i++)
-    {
-        dummy_q[i] = i;
-    }
-
-    // std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
-    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
-
-    // 4. Allocate a `ggml_backend_buffer` to store all tensors
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-    // 5. Copy tensor data from main memory (RAM) to backend buffer
-    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
-
-    // 6. Create a `ggml_cgraph` for mul_mat operation
-    struct ggml_cgraph * gf = NULL;
-    struct ggml_context * ctx0 = NULL;
-
-    // create a temporally context to build the graph
-    struct ggml_init_params params0 = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
-    };
-    ctx0 = ggml_init(params0);
-    gf = ggml_new_graph(ctx0);
-    /*
-        Compute graph
-    */
-    struct ggml_tensor * inp = ggml_cont(ctx0, ggml_permute(ctx0, inp_raw, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
-
-    inp = ggml_reshape_4d(
-        ctx0, inp,
-        c * 2, patches_w / 2, patches_h, batch_size);
-    inp = ggml_reshape_4d(
-        ctx0, inp,
-        c * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
-    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
-    inp = ggml_reshape_3d(
-        ctx0, inp,
-        c, patches_w * patches_h, batch_size);
-
-    // Add "result" tensor and all of its dependencies to the cgraph
-    ggml_build_forward_expand(gf, inp);
-
-    // 7. Create a `ggml_gallocr` for cgraph computation
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    // 9. Run the computation
-    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, n_threads);
-    }
-    ggml_backend_graph_compute(backend, gf);
-
-    // 10. Retrieve results (output tensors)
-    // in this example, output tensor is always the last tensor in the graph
-    struct ggml_tensor * result = inp;
-    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
-    float * result_data = (float *)malloc(ggml_nbytes(result));
-    // because the tensor data is stored in device buffer, we need to copy it back to RAM
-    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
-    const std::string bin_file = "patch_layout_" + backend_name + ".bin";
-    std::ofstream outFile(bin_file, std::ios::binary);
-
-    if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char *>(result_data), ggml_nbytes(result));
-        outFile.close();
-        std::cout << "Data successfully written to " + bin_file << std::endl;
-    } else {
-        std::cerr << "Error opening file!" << std::endl;
-    }
-
-    free(result_data);
-    // 11. Free memory and exit
-    ggml_free(ctx0);
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buffer);
-    ggml_backend_free(backend);
-}
-
-static void debug_test_get_rows() {
-    // 1. Initialize backend
-    ggml_backend_t backend = NULL;
-    std::string backend_name = "";
-// #ifdef GGML_USE_CUDA
-//     fprintf(stderr, "%s: using CUDA backend\n", __func__);
-//     backend = ggml_backend_cuda_init(0); // init device 0
-//     backend_name = "cuda";
-//     if (!backend) {
-//         fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-//     }
-// #endif
-    // if there aren't GPU Backends fallback to CPU backend
-    if (!backend) {
-        backend = ggml_backend_cpu_init();
-        backend_name = "cpu";
-    }
-
-    // Calculate the size needed to allocate
-    size_t ctx_size = 0;
-    ctx_size += 128 * ggml_tensor_overhead(); // tensors
-    // no need to allocate anything else!
-
-    // 2. Allocate `ggml_context` to store tensor data
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
-    };
-    struct ggml_context * ctx = ggml_init(params);
-
-    const int tokens = 30;
-    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 3, tokens * 2);
-    ggml_set_name(inp_raw, "inp_raw");
-    ggml_set_input(inp_raw);
-
-    struct ggml_tensor * pos = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 4, tokens);
-    // struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens * 4);
-    ggml_set_name(pos, "pos");
-    ggml_set_input(pos);
-
-    struct ggml_tensor * ind = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens);
-    ggml_set_name(ind, "ind");
-    ggml_set_input(ind);
-
-    struct ggml_tensor * ind_2d = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 1, tokens);
-    ggml_set_name(ind_2d, "ind_2d");
-    ggml_set_input(ind_2d);
-
-    std::vector<float> dummy_q;
-    dummy_q.resize(128 * 3 * inp_raw->ne[2]);
-    for (int i = 0; i < inp_raw->ne[2]; i++) {
-        for (int j = 0; j < 3; j++) {
-            int offset = i * 128 * 3 + j * 128;
-            std::fill(dummy_q.begin() + offset, dummy_q.begin() + offset + 128, 0.1 * i);
-        }
-    }
-    // std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
-    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
-
-    std::vector<int> pos_id;
-    pos_id.resize(tokens * 4);
-    for (int i = 0; i < tokens; i++) {
-        pos_id[i] = i;
-        pos_id[i + tokens * 1] = i + 10;
-        pos_id[i + tokens * 2] = i + 20;
-        pos_id[i + tokens * 3] = i + 30;
-    }
-
-    std::vector<int> remap_ind;
-    remap_ind.resize(tokens * 4);
-    for (int i = 0; i < tokens; i++) {
-        remap_ind[i] = tokens - i - 1;
-    }
-
-    // 4. Allocate a `ggml_backend_buffer` to store all tensors
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-    // 5. Copy tensor data from main memory (RAM) to backend buffer
-    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
-    ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
-    ggml_backend_tensor_set(ind, remap_ind.data(), 0, ggml_nbytes(ind));
-    ggml_backend_tensor_set(ind_2d, remap_ind.data(), 0, ggml_nbytes(ind_2d));
-
-    // 6. Create a `ggml_cgraph` for mul_mat operation
-    struct ggml_cgraph * gf = NULL;
-    struct ggml_context * ctx_cgraph = NULL;
-
-    // create a temporally context to build the graph
-    struct ggml_init_params params0 = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
-    };
-    ctx_cgraph = ggml_init(params0);
-    gf = ggml_new_graph(ctx_cgraph);
-
-    // ne = [128, 1, 30, 1]
-    auto x = ggml_reshape_2d(ctx_cgraph, inp_raw, 128 * 3 * 2, tokens);
-    struct ggml_tensor * result0 = ggml_get_rows(
-        ctx_cgraph, x, ind);
-    result0 = ggml_reshape_3d(ctx_cgraph, result0, 128, 3, tokens * 2);
-
-    struct ggml_tensor * result1 = ggml_get_rows(
-        ctx_cgraph, pos, ind);
-
-    // Add "result" tensor and all of its dependencies to the cgraph
-    ggml_build_forward_expand(gf, result0);
-    ggml_build_forward_expand(gf, result1);
-
-    // 7. Create a `ggml_gallocr` for cgraph computation
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    // 9. Run the computation
-    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, n_threads);
-    }
-    ggml_backend_graph_compute(backend, gf);
-
-    // 10. Retrieve results (output tensors)
-    // in this example, output tensor is always the last tensor in the graph
-    struct ggml_tensor * result = result0;
-    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
-    float * result_data = (float *)malloc(ggml_nbytes(result));
-    // because the tensor data is stored in device buffer, we need to copy it back to RAM
-    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
-    const std::string bin_file = "getrows_" + backend_name + "_0.bin";
-    std::ofstream outFile(bin_file, std::ios::binary);
-
-    if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char *>(result_data), ggml_nbytes(result));
-        outFile.close();
-        std::cout << "Data successfully written to " + bin_file << std::endl;
-    } else {
-        std::cerr << "Error opening file!" << std::endl;
-    }
-
-    free(result_data);
-    // 11. Free memory and exit
-    ggml_free(ctx_cgraph);
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buffer);
-    ggml_backend_free(backend);
-}
-
-
 enum model_output_type {
     conv3d,
     patch_embed,
@@ -955,9 +681,6 @@ int main(int argc, char ** argv) {
         // debug_test_mrope_2d();
         debug_dump_img_embed(ctx_llava, model_output_type::final_layer);
         // debug_dump_img_embed(ctx_llava, model_output_type::last_attn_layer);
-        // debug_test_get_rows();
-        // dump_win_attn_mask();
-        // debug_patch_layout();
 
         llama_perf_context_print(ctx_llava->ctx_llama);
         ctx_llava->model = NULL;