
Commit f982cc4

remove not-so-often-used qwen2vl-cli debug functions
1 parent 0664dba commit f982cc4

File tree

1 file changed: +0 -277 lines changed


examples/llava/qwen2vl-cli.cpp

Lines changed: 0 additions & 277 deletions
@@ -486,280 +486,6 @@ static void debug_test_mrope_2d() {
     ggml_backend_free(backend);
 }
 
-static void debug_patch_layout() {
-    // 1. Initialize backend
-    ggml_backend_t backend = NULL;
-    std::string backend_name = "";
-// #ifdef GGML_USE_CUDA
-//     fprintf(stderr, "%s: using CUDA backend\n", __func__);
-//     backend = ggml_backend_cuda_init(0); // init device 0
-//     backend_name = "cuda";
-//     if (!backend) {
-//         fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-//     }
-// #endif
-    // if there aren't GPU Backends fallback to CPU backend
-    if (!backend) {
-        backend = ggml_backend_cpu_init();
-        backend_name = "cpu";
-    }
-
-    // Calculate the size needed to allocate
-    size_t ctx_size = 0;
-    ctx_size += 2 * ggml_tensor_overhead(); // tensors
-    // no need to allocate anything else!
-
-    // 2. Allocate `ggml_context` to store tensor data
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
-    };
-    struct ggml_context * ctx = ggml_init(params);
-
-    const int patches_w = 14;
-    const int patches_h = 10;
-    const int c = 2;
-    const int batch_size = 1;
-    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, patches_w, patches_h, c, batch_size);
-    ggml_set_name(inp_raw, "inp_raw");
-    ggml_set_input(inp_raw);
-
-
-    std::vector<float> dummy_q;
-    dummy_q.resize(patches_w * patches_h * c * batch_size);
-    for (size_t i = 0; i < patches_h * patches_w * c; i++)
-    {
-        dummy_q[i] = i;
-    }
-
-    // std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
-    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
-
-    // 4. Allocate a `ggml_backend_buffer` to store all tensors
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-    // 5. Copy tensor data from main memory (RAM) to backend buffer
-    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
-
-    // 6. Create a `ggml_cgraph` for mul_mat operation
-    struct ggml_cgraph * gf = NULL;
-    struct ggml_context * ctx0 = NULL;
-
-    // create a temporally context to build the graph
-    struct ggml_init_params params0 = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
-    };
-    ctx0 = ggml_init(params0);
-    gf = ggml_new_graph(ctx0);
-    /*
-    Compute graph
-    */
-    struct ggml_tensor * inp = ggml_cont(ctx0, ggml_permute(ctx0, inp_raw, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
-
-    inp = ggml_reshape_4d(
-        ctx0, inp,
-        c * 2, patches_w / 2, patches_h, batch_size);
-    inp = ggml_reshape_4d(
-        ctx0, inp,
-        c * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
-    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
-    inp = ggml_reshape_3d(
-        ctx0, inp,
-        c, patches_w * patches_h, batch_size);
-
-    // Add "result" tensor and all of its dependencies to the cgraph
-    ggml_build_forward_expand(gf, inp);
-
-    // 7. Create a `ggml_gallocr` for cgraph computation
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    // 9. Run the computation
-    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, n_threads);
-    }
-    ggml_backend_graph_compute(backend, gf);
-
-    // 10. Retrieve results (output tensors)
-    // in this example, output tensor is always the last tensor in the graph
-    struct ggml_tensor * result = inp;
-    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
-    float * result_data = (float *)malloc(ggml_nbytes(result));
-    // because the tensor data is stored in device buffer, we need to copy it back to RAM
-    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
-    const std::string bin_file = "patch_layout_" + backend_name +".bin";
-    std::ofstream outFile(bin_file, std::ios::binary);
-
-    if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char*>(result_data), ggml_nbytes(result));
-        outFile.close();
-        std::cout << "Data successfully written to " + bin_file << std::endl;
-    } else {
-        std::cerr << "Error opening file!" << std::endl;
-    }
-
-    free(result_data);
-    // 11. Free memory and exit
-    ggml_free(ctx0);
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buffer);
-    ggml_backend_free(backend);
-}
-
-static void debug_test_get_rows() {
-    // 1. Initialize backend
-    ggml_backend_t backend = NULL;
-    std::string backend_name = "";
-// #ifdef GGML_USE_CUDA
-//     fprintf(stderr, "%s: using CUDA backend\n", __func__);
-//     backend = ggml_backend_cuda_init(0); // init device 0
-//     backend_name = "cuda";
-//     if (!backend) {
-//         fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-//     }
-// #endif
-    // if there aren't GPU Backends fallback to CPU backend
-    if (!backend) {
-        backend = ggml_backend_cpu_init();
-        backend_name = "cpu";
-    }
-
-    // Calculate the size needed to allocate
-    size_t ctx_size = 0;
-    ctx_size += 128 * ggml_tensor_overhead(); // tensors
-    // no need to allocate anything else!
-
-    // 2. Allocate `ggml_context` to store tensor data
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
-    };
-    struct ggml_context * ctx = ggml_init(params);
-
-    const int tokens = 30;
-    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 3, tokens * 2);
-    ggml_set_name(inp_raw, "inp_raw");
-    ggml_set_input(inp_raw);
-
-    struct ggml_tensor * pos = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 4, tokens);
-    // struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens * 4);
-    ggml_set_name(pos, "pos");
-    ggml_set_input(pos);
-
-    struct ggml_tensor * ind = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens);
-    ggml_set_name(ind, "ind");
-    ggml_set_input(ind);
-
-    struct ggml_tensor * ind_2d = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 1, tokens);
-    ggml_set_name(ind_2d, "ind_2d");
-    ggml_set_input(ind_2d);
-
-    std::vector<float> dummy_q;
-    dummy_q.resize(128 * 3 * inp_raw->ne[2]);
-    for (int i = 0; i < inp_raw->ne[2]; i ++) {
-        for (int j = 0; j < 3; j ++) {
-            int offset = i * 128 * 3 + j * 128;
-            std::fill(dummy_q.begin() + offset, dummy_q.begin() + offset + 128, 0.1 * i);
-        }
-    }
-    // std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
-    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
-
-    std::vector<int> pos_id;
-    pos_id.resize(tokens * 4);
-    for (int i = 0; i < tokens; i ++) {
-        pos_id[i] = i;
-        pos_id[i + tokens * 1] = i + 10;
-        pos_id[i + tokens * 2] = i + 20;
-        pos_id[i + tokens * 3] = i + 30;
-    }
-
-    std::vector<int> remap_ind;
-    remap_ind.resize(tokens * 4);
-    for (int i = 0; i < tokens; i ++) {
-        remap_ind[i] = tokens - i - 1;
-    }
-
-    // 4. Allocate a `ggml_backend_buffer` to store all tensors
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-    // 5. Copy tensor data from main memory (RAM) to backend buffer
-    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
-    ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
-    ggml_backend_tensor_set(ind, remap_ind.data(), 0, ggml_nbytes(ind));
-    ggml_backend_tensor_set(ind_2d, remap_ind.data(), 0, ggml_nbytes(ind_2d));
-
-    // 6. Create a `ggml_cgraph` for mul_mat operation
-    struct ggml_cgraph * gf = NULL;
-    struct ggml_context * ctx_cgraph = NULL;
-
-    // create a temporally context to build the graph
-    struct ggml_init_params params0 = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
-    };
-    ctx_cgraph = ggml_init(params0);
-    gf = ggml_new_graph(ctx_cgraph);
-
-    // ne = [128, 1, 30, 1]
-    auto x = ggml_reshape_2d(ctx_cgraph, inp_raw, 128 * 3 * 2, tokens);
-    struct ggml_tensor * result0 = ggml_get_rows(
-        ctx_cgraph, x, ind);
-    result0 = ggml_reshape_3d(ctx_cgraph, result0, 128, 3, tokens * 2);
-
-    struct ggml_tensor * result1 = ggml_get_rows(
-        ctx_cgraph, pos, ind);
-
-    // Add "result" tensor and all of its dependencies to the cgraph
-    ggml_build_forward_expand(gf, result0);
-    ggml_build_forward_expand(gf, result1);
-
-    // 7. Create a `ggml_gallocr` for cgraph computation
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    // 9. Run the computation
-    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, n_threads);
-    }
-    ggml_backend_graph_compute(backend, gf);
-
-    // 10. Retrieve results (output tensors)
-    // in this example, output tensor is always the last tensor in the graph
-    struct ggml_tensor * result = result0;
-    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
-    float * result_data = (float *)malloc(ggml_nbytes(result));
-    // because the tensor data is stored in device buffer, we need to copy it back to RAM
-    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
-    const std::string bin_file = "getrows_" + backend_name +"_0.bin";
-    std::ofstream outFile(bin_file, std::ios::binary);
-
-    if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char*>(result_data), ggml_nbytes(result));
-        outFile.close();
-        std::cout << "Data successfully written to " + bin_file << std::endl;
-    } else {
-        std::cerr << "Error opening file!" << std::endl;
-    }
-
-    free(result_data);
-    // 11. Free memory and exit
-    ggml_free(ctx_cgraph);
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buffer);
-    ggml_backend_free(backend);
-}
-
-
 enum model_output_type {
     conv3d,
     patch_embed,
@@ -955,9 +681,6 @@ int main(int argc, char ** argv) {
         // debug_test_mrope_2d();
         debug_dump_img_embed(ctx_llava, model_output_type::final_layer);
         // debug_dump_img_embed(ctx_llava, model_output_type::last_attn_layer);
-        // debug_test_get_rows();
-        // dump_win_attn_mask();
-        // debug_patch_layout();
 
         llama_perf_context_print(ctx_llava->ctx_llama);
         ctx_llava->model = NULL;
