Skip to content

Commit c4411d5

Browse files
threads: add simple barrier test
This test does lots of small, parallel matmul ops where the barriers in between dominate the overhead.
1 parent ed094a5 commit c4411d5

File tree

2 files changed

+94
-0
lines changed

2 files changed

+94
-0
lines changed

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ llama_target_and_test(test-grammar-parser.cpp)
119119
llama_target_and_test(test-llama-grammar.cpp)
120120
llama_target_and_test(test-grammar-integration.cpp)
121121
llama_target_and_test(test-grad0.cpp)
122+
llama_target_and_test(test-barrier.cpp)
122123
# llama_target_and_test(test-opt.cpp) # SLOW
123124
llama_target_and_test(test-backend-ops.cpp)
124125

tests/test-barrier.cpp

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#include "ggml.h"
2+
#include "ggml-backend.h"
3+
4+
#include <chrono>
5+
#include <iostream>
6+
#include <cstdio>
7+
#include <cstdlib>
8+
#include <cassert>
9+
#include <vector>
10+
11+
// NOTE(review): MAX_NARGS is not referenced anywhere else in this file —
// confirm whether it is still needed or can be dropped.
#define MAX_NARGS 2
12+
13+
int main(int argc, char *argv[]) {
14+
15+
int n_threads = 4;
16+
int n_rounds = 100;
17+
18+
if (argc > 1) {
19+
n_threads = std::atoi(argv[1]);
20+
}
21+
22+
if (argc > 2) {
23+
n_rounds = std::atoi(argv[2]);
24+
}
25+
26+
struct ggml_init_params params = {
27+
/* .mem_size = */ 1024*1024*1024,
28+
/* .mem_buffer = */ NULL,
29+
/* .no_alloc = */ false,
30+
};
31+
32+
struct ggml_context * ctx = ggml_init(params);
33+
34+
// Create graph
35+
struct ggml_cgraph * gf = ggml_new_graph(ctx);
36+
37+
// Lots of small, parallel ops where barriers in between will dominate
38+
struct ggml_tensor * out = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
39+
for (int i = 0; i < 1000; i++) {
40+
struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 64, 128);
41+
out = ggml_mul_mat(ctx, a, out);
42+
43+
struct ggml_tensor * d = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 128, 64);
44+
out = ggml_mul_mat(ctx, d, out);
45+
}
46+
47+
ggml_build_forward_expand(gf, out);
48+
int n_nodes = ggml_graph_n_nodes(gf);
49+
50+
// Create threadpool
51+
struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
52+
struct ggml_threadpool* threadpool = ggml_threadpool_new(&tpp);
53+
if (!threadpool) {
54+
fprintf(stderr, "threadpool create failed : n_threads %d\n", n_threads);
55+
exit(1);
56+
}
57+
58+
// Create compute plan
59+
struct ggml_cplan cplan = ggml_graph_plan(gf, n_threads, threadpool);
60+
61+
std::vector<uint8_t> work_data(cplan.work_size);
62+
cplan.work_data = work_data.data();
63+
64+
std::cerr << "graph-compute with"
65+
<< "\n n_threads: " << n_threads
66+
<< "\n n_nodes: " << n_nodes
67+
<< "\n n_rounds: " << n_rounds
68+
<< "\n";
69+
// ggml_graph_print(gf);
70+
71+
// Warmup
72+
ggml_graph_compute(gf, &cplan);
73+
74+
auto t0 = std::chrono::high_resolution_clock::now();
75+
76+
for (int i=0; i < n_rounds; i++) {
77+
ggml_graph_compute(gf, &cplan);
78+
}
79+
80+
auto t1 = std::chrono::high_resolution_clock::now();
81+
82+
auto usec = std::chrono::duration_cast<std::chrono::microseconds>(t1-t0).count();
83+
auto nsec = std::chrono::duration_cast<std::chrono::nanoseconds>(t1-t0).count();
84+
std::cerr << "graph-compute took " << usec << " usec "
85+
<< "\n " << (float) usec / n_rounds << " usec per-iter"
86+
<< "\n " << (float) nsec / (n_rounds * n_nodes) << " nsec per-node"
87+
<< "\n";
88+
89+
ggml_free(ctx);
90+
ggml_threadpool_free(threadpool);
91+
92+
return 0;
93+
}

0 commit comments

Comments
 (0)