Skip to content

Commit d7b5934

Browse files
committed
examples : add basic metal perf tool [no ci]
1 parent c919d5d commit d7b5934

File tree

4 files changed

+167
-7
lines changed

4 files changed

+167
-7
lines changed

examples/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ else()
3535
add_subdirectory(main)
3636
add_subdirectory(parallel)
3737
add_subdirectory(passkey)
38+
if (GGML_METAL)
39+
add_subdirectory(perf-metal)
40+
endif()
3841
add_subdirectory(perplexity)
3942
add_subdirectory(quantize-stats)
4043
add_subdirectory(quantize)

examples/perf-metal/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Build target for the basic Metal performance tool (examples/perf-metal).
set(TARGET llama-perf-metal)

add_executable(${TARGET} perf-metal.cpp)
install(TARGETS ${TARGET} RUNTIME)

# Only ggml is needed (plus the platform thread library for std::thread).
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/perf-metal/perf-metal.cpp

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#include "ggml.h"
2+
#include "ggml-alloc.h"
3+
#include "ggml-backend.h"
4+
#include "ggml-metal.h"
5+
6+
#include <cstdio>
7+
#include <vector>
8+
#include <thread>
9+
10+
int main(int argc, char ** argv) {
11+
int n_op = 1024;
12+
int n_iter = 128;
13+
14+
if (argc > 1) {
15+
n_op = std::atoi(argv[1]);
16+
}
17+
18+
if (argc > 2) {
19+
n_iter = std::atoi(argv[2]);
20+
}
21+
22+
printf("%s: n_op = %d, n_iter = %d\n", __func__, n_op, n_iter);
23+
24+
const int ne00 = 8;
25+
const int ne01 = 8;
26+
const int ne11 = 8;
27+
28+
std::vector<float> data0(ne00*ne01, 1.0f);
29+
std::vector<float> data1(ne00*ne01, 1.0f/ne00);
30+
31+
ggml_backend_t backend = ggml_backend_metal_init();
32+
if (!backend) {
33+
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
34+
return 1;
35+
}
36+
37+
const size_t ctx_size = 2 * ggml_tensor_overhead();
38+
39+
struct ggml_init_params params = {
40+
/*.mem_size =*/ ctx_size,
41+
/*.mem_buffer =*/ NULL,
42+
/*.no_alloc =*/ true,
43+
};
44+
struct ggml_context * ctx = ggml_init(params);
45+
46+
struct ggml_tensor * t0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne00, ne01);
47+
struct ggml_tensor * t1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne00, ne11);
48+
49+
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
50+
51+
ggml_backend_tensor_set(t0, data0.data(), 0, ggml_nbytes(t0));
52+
ggml_backend_tensor_set(t1, data1.data(), 0, ggml_nbytes(t1));
53+
54+
struct ggml_cgraph * gf = NULL;
55+
56+
struct ggml_context * ctx_cgraph = NULL;
57+
58+
// create a dummy compute graph:
59+
//
60+
// x = mul_mat(t0, t1)
61+
// x = x * 1.0f
62+
// x = mul_mat(x, t1)
63+
// x = x * 1.0f
64+
// ... repeat n_op times ...
65+
//
66+
{
67+
struct ggml_init_params params0 = {
68+
/*.mem_size =*/ 4*n_op*ggml_tensor_overhead() + ggml_graph_overhead(),
69+
/*.mem_buffer =*/ NULL,
70+
/*.no_alloc =*/ true,
71+
};
72+
ctx_cgraph = ggml_init(params0);
73+
74+
gf = ggml_new_graph_custom(ctx_cgraph, 4*n_op, false);
75+
76+
struct ggml_tensor * cur = ggml_mul_mat(ctx_cgraph, t0, t1);
77+
cur = ggml_scale(ctx_cgraph, cur, 1.0f);
78+
79+
for (int i = 0; i < n_op - 1; i++) {
80+
cur = ggml_mul_mat(ctx_cgraph, cur, t1);
81+
cur = ggml_scale(ctx_cgraph, cur, 1.0f);
82+
}
83+
84+
ggml_build_forward_expand(gf, cur);
85+
}
86+
87+
printf("%s: graph nodes = %d\n", __func__, ggml_graph_n_nodes(gf));
88+
89+
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
90+
ggml_gallocr_alloc_graph(allocr, gf);
91+
92+
for (int n_thread = 1; n_thread < std::thread::hardware_concurrency(); n_thread++) {
93+
ggml_backend_metal_set_n_cb(backend, n_thread);
94+
95+
// warm-up
96+
ggml_backend_graph_compute(backend, gf);
97+
98+
const int64_t t_start = ggml_time_us();
99+
100+
for (int iter = 0; iter < n_iter; iter++) {
101+
ggml_backend_graph_compute(backend, gf);
102+
}
103+
104+
const int64_t t_end = ggml_time_us();
105+
106+
// actual trace
107+
if (n_thread == 4) {
108+
ggml_backend_metal_capture_next_compute(backend);
109+
ggml_backend_graph_compute(backend, gf);
110+
ggml_backend_metal_capture_next_compute(backend);
111+
ggml_backend_graph_compute(backend, gf);
112+
ggml_backend_metal_capture_next_compute(backend);
113+
ggml_backend_graph_compute(backend, gf);
114+
115+
printf("%s: trace dumped\n", __func__);
116+
}
117+
118+
printf("%s: n_thread = %d, time = %f ms\n", __func__, n_thread, (t_end - t_start) / 1000.0 / n_iter);
119+
}
120+
121+
{
122+
struct ggml_tensor * res = ggml_graph_node(gf, -1);
123+
124+
std::vector<float> data(res->ne[0] * res->ne[1], 0.0f);
125+
126+
ggml_backend_tensor_get(res, data.data(), 0, ggml_nbytes(res));
127+
128+
for (int i1 = 0; i1 < res->ne[1]; i1++) {
129+
for (int i0 = 0; i0 < res->ne[0]; i0++) {
130+
printf("%f ", data[i1*res->ne[0] + i0]);
131+
}
132+
printf("\n");
133+
}
134+
}
135+
136+
// 11. Free memory and exit
137+
ggml_free(ctx_cgraph);
138+
ggml_gallocr_free(allocr);
139+
ggml_free(ctx);
140+
ggml_backend_buffer_free(buffer);
141+
ggml_backend_free(backend);
142+
return 0;
143+
}
144+

ggml/src/ggml-metal.m

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@
234234
bool support_simdgroup_mm;
235235

236236
bool should_capture_next_compute;
237+
bool capture_started;
237238

238239
// abort ggml_metal_graph_compute if callback returns true
239240
ggml_abort_callback abort_callback;
@@ -456,6 +457,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
456457
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
457458

458459
ctx->should_capture_next_compute = false;
460+
ctx->capture_started = false;
459461

460462
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
461463
if (@available(macOS 10.12, iOS 16.0, *)) {
@@ -893,13 +895,19 @@ static enum ggml_status ggml_metal_graph_compute(
893895
if (should_capture) {
894896
ctx->should_capture_next_compute = false;
895897

896-
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
897-
descriptor.captureObject = ctx->queue;
898+
if (!ctx->capture_started) {
899+
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
900+
descriptor.captureObject = ctx->queue;
901+
descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
902+
descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]];
898903

899-
NSError * error = nil;
900-
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
901-
GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
902-
GGML_ABORT("capture failed");
904+
NSError * error = nil;
905+
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
906+
GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
907+
GGML_ABORT("capture failed");
908+
} else {
909+
ctx->capture_started = true;
910+
}
903911
}
904912
}
905913

@@ -3066,7 +3074,7 @@ static enum ggml_status ggml_metal_graph_compute(
30663074
[next_buffer commit];
30673075
}
30683076

3069-
if (should_capture) {
3077+
if (!should_capture && ctx->capture_started) {
30703078
[[MTLCaptureManager sharedCaptureManager] stopCapture];
30713079
}
30723080

0 commit comments

Comments
 (0)