@@ -486,280 +486,6 @@ static void debug_test_mrope_2d() {
     ggml_backend_free(backend);
 }
 
-static void debug_patch_layout() {
-    // 1. Initialize backend
-    ggml_backend_t backend = NULL;
-    std::string backend_name = "";
-// #ifdef GGML_USE_CUDA
-//     fprintf(stderr, "%s: using CUDA backend\n", __func__);
-//     backend = ggml_backend_cuda_init(0); // init device 0
-//     backend_name = "cuda";
-//     if (!backend) {
-//         fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-//     }
-// #endif
-    // if there aren't GPU Backends fallback to CPU backend
-    if (!backend) {
-        backend = ggml_backend_cpu_init();
-        backend_name = "cpu";
-    }
-
-    // Calculate the size needed to allocate
-    size_t ctx_size = 0;
-    ctx_size += 2 * ggml_tensor_overhead(); // tensors
-    // no need to allocate anything else!
-
-    // 2. Allocate `ggml_context` to store tensor data
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
-    };
-    struct ggml_context * ctx = ggml_init(params);
-
-    const int patches_w = 14;
-    const int patches_h = 10;
-    const int c = 2;
-    const int batch_size = 1;
-    struct ggml_tensor * inp_raw = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, patches_w, patches_h, c, batch_size);
-    ggml_set_name(inp_raw, "inp_raw");
-    ggml_set_input(inp_raw);
-
-
-    std::vector<float> dummy_q;
-    dummy_q.resize(patches_w * patches_h * c * batch_size);
-    for (size_t i = 0; i < patches_h * patches_w * c; i++)
-    {
-        dummy_q[i] = i;
-    }
-
-    // std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
-    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
-
-    // 4. Allocate a `ggml_backend_buffer` to store all tensors
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-    // 5. Copy tensor data from main memory (RAM) to backend buffer
-    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
-
-    // 6. Create a `ggml_cgraph` for mul_mat operation
-    struct ggml_cgraph * gf = NULL;
-    struct ggml_context * ctx0 = NULL;
-
-    // create a temporally context to build the graph
-    struct ggml_init_params params0 = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
-    };
-    ctx0 = ggml_init(params0);
-    gf = ggml_new_graph(ctx0);
-    /*
-        Compute graph
-    */
-    struct ggml_tensor * inp = ggml_cont(ctx0, ggml_permute(ctx0, inp_raw, 1, 2, 0, 3)); // [w, h, c, b] -> [c, w, h, b]
-
-    inp = ggml_reshape_4d(
-        ctx0, inp,
-        c * 2, patches_w / 2, patches_h, batch_size);
-    inp = ggml_reshape_4d(
-        ctx0, inp,
-        c * 2, patches_w / 2, 2, batch_size * (patches_h / 2));
-    inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 0, 2, 1, 3));
-    inp = ggml_reshape_3d(
-        ctx0, inp,
-        c, patches_w * patches_h, batch_size);
-
-    // Add "result" tensor and all of its dependencies to the cgraph
-    ggml_build_forward_expand(gf, inp);
-
-    // 7. Create a `ggml_gallocr` for cgraph computation
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    // 9. Run the computation
-    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, n_threads);
-    }
-    ggml_backend_graph_compute(backend, gf);
-
-    // 10. Retrieve results (output tensors)
-    // in this example, output tensor is always the last tensor in the graph
-    struct ggml_tensor * result = inp;
-    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
-    float * result_data = (float *)malloc(ggml_nbytes(result));
-    // because the tensor data is stored in device buffer, we need to copy it back to RAM
-    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
-    const std::string bin_file = "patch_layout_" + backend_name + ".bin";
-    std::ofstream outFile(bin_file, std::ios::binary);
-
-    if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char *>(result_data), ggml_nbytes(result));
-        outFile.close();
-        std::cout << "Data successfully written to " + bin_file << std::endl;
-    } else {
-        std::cerr << "Error opening file!" << std::endl;
-    }
-
-    free(result_data);
-    // 11. Free memory and exit
-    ggml_free(ctx0);
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buffer);
-    ggml_backend_free(backend);
-}
-
-static void debug_test_get_rows() {
-    // 1. Initialize backend
-    ggml_backend_t backend = NULL;
-    std::string backend_name = "";
-// #ifdef GGML_USE_CUDA
-//     fprintf(stderr, "%s: using CUDA backend\n", __func__);
-//     backend = ggml_backend_cuda_init(0); // init device 0
-//     backend_name = "cuda";
-//     if (!backend) {
-//         fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-//     }
-// #endif
-    // if there aren't GPU Backends fallback to CPU backend
-    if (!backend) {
-        backend = ggml_backend_cpu_init();
-        backend_name = "cpu";
-    }
-
-    // Calculate the size needed to allocate
-    size_t ctx_size = 0;
-    ctx_size += 128 * ggml_tensor_overhead(); // tensors
-    // no need to allocate anything else!
-
-    // 2. Allocate `ggml_context` to store tensor data
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_alloc_ctx_tensors()
-    };
-    struct ggml_context * ctx = ggml_init(params);
-
-    const int tokens = 30;
-    struct ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 128, 3, tokens * 2);
-    ggml_set_name(inp_raw, "inp_raw");
-    ggml_set_input(inp_raw);
-
-    struct ggml_tensor * pos = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 4, tokens);
-    // struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens * 4);
-    ggml_set_name(pos, "pos");
-    ggml_set_input(pos);
-
-    struct ggml_tensor * ind = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, tokens);
-    ggml_set_name(ind, "ind");
-    ggml_set_input(ind);
-
-    struct ggml_tensor * ind_2d = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 1, tokens);
-    ggml_set_name(ind_2d, "ind_2d");
-    ggml_set_input(ind_2d);
-
-    std::vector<float> dummy_q;
-    dummy_q.resize(128 * 3 * inp_raw->ne[2]);
-    for (int i = 0; i < inp_raw->ne[2]; i++) {
-        for (int j = 0; j < 3; j++) {
-            int offset = i * 128 * 3 + j * 128;
-            std::fill(dummy_q.begin() + offset, dummy_q.begin() + offset + 128, 0.1 * i);
-        }
-    }
-    // std::fill(dummy_q.begin(), dummy_q.end(), 0.1);
-    // memcpy(inp_raw->data, dummy_q.data(), 128 * 12 * 30 * ggml_element_size(inp_raw));
-
-    std::vector<int> pos_id;
-    pos_id.resize(tokens * 4);
-    for (int i = 0; i < tokens; i++) {
-        pos_id[i] = i;
-        pos_id[i + tokens * 1] = i + 10;
-        pos_id[i + tokens * 2] = i + 20;
-        pos_id[i + tokens * 3] = i + 30;
-    }
-
-    std::vector<int> remap_ind;
-    remap_ind.resize(tokens * 4);
-    for (int i = 0; i < tokens; i++) {
-        remap_ind[i] = tokens - i - 1;
-    }
-
-    // 4. Allocate a `ggml_backend_buffer` to store all tensors
-    ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
-
-    // 5. Copy tensor data from main memory (RAM) to backend buffer
-    ggml_backend_tensor_set(inp_raw, dummy_q.data(), 0, ggml_nbytes(inp_raw));
-    ggml_backend_tensor_set(pos, pos_id.data(), 0, ggml_nbytes(pos));
-    ggml_backend_tensor_set(ind, remap_ind.data(), 0, ggml_nbytes(ind));
-    ggml_backend_tensor_set(ind_2d, remap_ind.data(), 0, ggml_nbytes(ind_2d));
-
-    // 6. Create a `ggml_cgraph` for mul_mat operation
-    struct ggml_cgraph * gf = NULL;
-    struct ggml_context * ctx_cgraph = NULL;
-
-    // create a temporally context to build the graph
-    struct ggml_init_params params0 = {
-        /*.mem_size   =*/ ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead(),
-        /*.mem_buffer =*/ NULL,
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
-    };
-    ctx_cgraph = ggml_init(params0);
-    gf = ggml_new_graph(ctx_cgraph);
-
-    // ne = [128, 1, 30, 1]
-    auto x = ggml_reshape_2d(ctx_cgraph, inp_raw, 128 * 3 * 2, tokens);
-    struct ggml_tensor * result0 = ggml_get_rows(
-        ctx_cgraph, x, ind);
-    result0 = ggml_reshape_3d(ctx_cgraph, result0, 128, 3, tokens * 2);
-
-    struct ggml_tensor * result1 = ggml_get_rows(
-        ctx_cgraph, pos, ind);
-
-    // Add "result" tensor and all of its dependencies to the cgraph
-    ggml_build_forward_expand(gf, result0);
-    ggml_build_forward_expand(gf, result1);
-
-    // 7. Create a `ggml_gallocr` for cgraph computation
-    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    // 9. Run the computation
-    int n_threads = 1; // Optional: number of threads to perform some operations with multi-threading
-    if (ggml_backend_is_cpu(backend)) {
-        ggml_backend_cpu_set_n_threads(backend, n_threads);
-    }
-    ggml_backend_graph_compute(backend, gf);
-
-    // 10. Retrieve results (output tensors)
-    // in this example, output tensor is always the last tensor in the graph
-    struct ggml_tensor * result = result0;
-    // struct ggml_tensor * result = gf->nodes[gf->n_nodes - 1];
-    float * result_data = (float *)malloc(ggml_nbytes(result));
-    // because the tensor data is stored in device buffer, we need to copy it back to RAM
-    ggml_backend_tensor_get(result, result_data, 0, ggml_nbytes(result));
-    const std::string bin_file = "getrows_" + backend_name + "_0.bin";
-    std::ofstream outFile(bin_file, std::ios::binary);
-
-    if (outFile.is_open()) {
-        outFile.write(reinterpret_cast<const char *>(result_data), ggml_nbytes(result));
-        outFile.close();
-        std::cout << "Data successfully written to " + bin_file << std::endl;
-    } else {
-        std::cerr << "Error opening file!" << std::endl;
-    }
-
-    free(result_data);
-    // 11. Free memory and exit
-    ggml_free(ctx_cgraph);
-    ggml_gallocr_free(allocr);
-    ggml_free(ctx);
-    ggml_backend_buffer_free(buffer);
-    ggml_backend_free(backend);
-}
-
-
 enum model_output_type {
     conv3d,
     patch_embed,
@@ -955,9 +681,6 @@ int main(int argc, char ** argv) {
         // debug_test_mrope_2d();
         debug_dump_img_embed(ctx_llava, model_output_type::final_layer);
         // debug_dump_img_embed(ctx_llava, model_output_type::last_attn_layer);
-        // debug_test_get_rows();
-        // dump_win_attn_mask();
-        // debug_patch_layout();
 
         llama_perf_context_print(ctx_llava->ctx_llama);
         ctx_llava->model = NULL;