|
| 1 | +#include "ggml.h" |
| 2 | +#include "ggml-backend.h" |
| 3 | + |
| 4 | +#include <chrono> |
| 5 | +#include <iostream> |
| 6 | +#include <cstdio> |
| 7 | +#include <cstdlib> |
| 8 | +#include <cassert> |
| 9 | +#include <vector> |
| 10 | + |
| 11 | +#define MAX_NARGS 2 |
| 12 | + |
| 13 | +int main(int argc, char *argv[]) { |
| 14 | + |
| 15 | + int n_threads = 4; |
| 16 | + int n_rounds = 100; |
| 17 | + |
| 18 | + if (argc > 1) { |
| 19 | + n_threads = std::atoi(argv[1]); |
| 20 | + } |
| 21 | + |
| 22 | + if (argc > 2) { |
| 23 | + n_rounds = std::atoi(argv[2]); |
| 24 | + } |
| 25 | + |
| 26 | + struct ggml_init_params params = { |
| 27 | + /* .mem_size = */ 1024*1024*1024, |
| 28 | + /* .mem_buffer = */ NULL, |
| 29 | + /* .no_alloc = */ false, |
| 30 | + }; |
| 31 | + |
| 32 | + struct ggml_context * ctx = ggml_init(params); |
| 33 | + |
| 34 | + // Create graph |
| 35 | + struct ggml_cgraph * gf = ggml_new_graph(ctx); |
| 36 | + |
| 37 | + // Lots of small, parallel ops where barriers in between will dominate |
| 38 | + struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64); |
| 39 | + for (int i = 0; i < 1000; i++) { |
| 40 | + struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128); |
| 41 | + out = ggml_mul_mat(ctx, a, out); |
| 42 | + |
| 43 | + struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64); |
| 44 | + out = ggml_mul_mat(ctx, d, out); |
| 45 | + } |
| 46 | + |
| 47 | + ggml_build_forward_expand(gf, out); |
| 48 | + int n_nodes = ggml_graph_n_nodes(gf); |
| 49 | + |
| 50 | + // Create threadpool |
| 51 | + struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads); |
| 52 | + struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp); |
| 53 | + if (!threadpool) { |
| 54 | + fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads); |
| 55 | + exit(1); |
| 56 | + } |
| 57 | + |
| 58 | + // Create compute plan |
| 59 | + struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool); |
| 60 | + |
| 61 | + std::vector<uint8_t> work_data(cplan.work_size); |
| 62 | + cplan.work_data = work_data.data(); |
| 63 | + |
| 64 | + std::cerr << "graph-compute with" |
| 65 | + << "\n n_threads: " << n_threads |
| 66 | + << "\n n_nodes: " << n_nodes |
| 67 | + << "\n n_rounds: " << n_rounds |
| 68 | + << "\n"; |
| 69 | + // ggml_graph_print(gf); |
| 70 | + |
| 71 | + // Warmup |
| 72 | + ggml_graph_compute(gf, &cplan); |
| 73 | + |
| 74 | + auto t0 = std::chrono::high_resolution_clock::now(); |
| 75 | + |
| 76 | + for (int i=0; i < n_rounds; i++) { |
| 77 | + ggml_graph_compute(gf, &cplan); |
| 78 | + } |
| 79 | + |
| 80 | + auto t1 = std::chrono::high_resolution_clock::now(); |
| 81 | + |
| 82 | + auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count(); |
| 83 | + auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count(); |
| 84 | + std::cerr << "graph-compute took " << usec << " usec " |
| 85 | + << "\n " << (float) usec / n_rounds << " usec per-iter" |
| 86 | + << "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node" |
| 87 | + << "\n"; |
| 88 | + |
| 89 | + ggml_free(ctx); |
| 90 | + ggml_threadpool_free(threadpool); |
| 91 | + |
| 92 | + return 0; |
| 93 | +} |
0 commit comments