Skip to content

Commit d7b5934

Browse files
committed
examples : add basic metal perf tool [no ci]
1 parent c919d5d commit d7b5934

File tree

4 files changed

+167
-7
lines changed

4 files changed

+167
-7
lines changed

examples/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,9 @@ else()
3535
add_subdirectory(main)
3636
add_subdirectory(parallel)
3737
add_subdirectory(passkey)
38+
if (GGML_METAL)
39+
add_subdirectory(perf-metal)
40+
endif()
3841
add_subdirectory(perplexity)
3942
add_subdirectory(quantize-stats)
4043
add_subdirectory(quantize)

examples/perf-metal/CMakeLists.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
# Build target for the basic Metal performance tool (examples/perf-metal).
set(TARGET llama-perf-metal)

add_executable(${TARGET} perf-metal.cpp)
install(TARGETS ${TARGET} RUNTIME)

# Only ggml is needed (plus the platform thread library for std::thread).
target_link_libraries(${TARGET} PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

examples/perf-metal/perf-metal.cpp

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#include "ggml.h"
2+
#include "ggml-alloc.h"
3+
#include "ggml-backend.h"
4+
#include "ggml-metal.h"
5+
6+
#include <cstdio>
7+
#include <vector>
8+
#include <thread>
9+
10+
int main(int argc, char ** argv) {
11+
int n_op = 1024;
12+
int n_iter = 128;
13+
14+
if (argc > 1) {
15+
n_op = std::atoi(argv[1]);
16+
}
17+
18+
if (argc > 2) {
19+
n_iter = std::atoi(argv[2]);
20+
}
21+
22+
printf("%s: n_op = %d, n_iter = %d\n", __func__, n_op, n_iter);
23+
24+
const int ne00 = 8;
25+
const int ne01 = 8;
26+
const int ne11 = 8;
27+
28+
std::vector<float> data0(ne00*ne01, 1.0f);
29+
std::vector<float> data1(ne00*ne01, 1.0f/ne00);
30+
31+
ggml_backend_t backend = ggml_backend_metal_init();
32+
if (!backend) {
33+
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
34+
return 1;
35+
}
36+
37+
const size_t ctx_size = 2 * ggml_tensor_overhead();
38+
39+
struct ggml_init_params params = {
40+
/*.mem_size =*/ ctx_size,
41+
/*.mem_buffer =*/ NULL,
42+
/*.no_alloc =*/ true,
43+
};
44+
struct ggml_context * ctx = ggml_init(params);
45+
46+
struct ggml_tensor * t0 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne00, ne01);
47+
struct ggml_tensor * t1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne00, ne11);
48+
49+
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx, backend);
50+
51+
ggml_backend_tensor_set(t0, data0.data(), 0, ggml_nbytes(t0));
52+
ggml_backend_tensor_set(t1, data1.data(), 0, ggml_nbytes(t1));
53+
54+
struct ggml_cgraph * gf = NULL;
55+
56+
struct ggml_context * ctx_cgraph = NULL;
57+
58+
// create a dummy compute graph:
59+
//
60+
// x = mul_mat(t0, t1)
61+
// x = x * 1.0f
62+
// x = mul_mat(x, t1)
63+
// x = x * 1.0f
64+
// ... repeat n_op times ...
65+
//
66+
{
67+
struct ggml_init_params params0 = {
68+
/*.mem_size =*/ 4*n_op*ggml_tensor_overhead() + ggml_graph_overhead(),
69+
/*.mem_buffer =*/ NULL,
70+
/*.no_alloc =*/ true,
71+
};
72+
ctx_cgraph = ggml_init(params0);
73+
74+
gf = ggml_new_graph_custom(ctx_cgraph, 4*n_op, false);
75+
76+
struct ggml_tensor * cur = ggml_mul_mat(ctx_cgraph, t0, t1);
77+
cur = ggml_scale(ctx_cgraph, cur, 1.0f);
78+
79+
for (int i = 0; i < n_op - 1; i++) {
80+
cur = ggml_mul_mat(ctx_cgraph, cur, t1);
81+
cur = ggml_scale(ctx_cgraph, cur, 1.0f);
82+
}
83+
84+
ggml_build_forward_expand(gf, cur);
85+
}
86+
87+
printf("%s: graph nodes = %d\n", __func__, ggml_graph_n_nodes(gf));
88+
89+
ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
90+
ggml_gallocr_alloc_graph(allocr, gf);
91+
92+
for (int n_thread = 1; n_thread < std::thread::hardware_concurrency(); n_thread++) {
93+
ggml_backend_metal_set_n_cb(backend, n_thread);
94+
95+
// warm-up
96+
ggml_backend_graph_compute(backend, gf);
97+
98+
const int64_t t_start = ggml_time_us();
99+
100+
for (int iter = 0; iter < n_iter; iter++) {
101+
ggml_backend_graph_compute(backend, gf);
102+
}
103+
104+
const int64_t t_end = ggml_time_us();
105+
106+
// actual trace
107+
if (n_thread == 4) {
108+
ggml_backend_metal_capture_next_compute(backend);
109+
ggml_backend_graph_compute(backend, gf);
110+
ggml_backend_metal_capture_next_compute(backend);
111+
ggml_backend_graph_compute(backend, gf);
112+
ggml_backend_metal_capture_next_compute(backend);
113+
ggml_backend_graph_compute(backend, gf);
114+
115+
printf("%s: trace dumped\n", __func__);
116+
}
117+
118+
printf("%s: n_thread = %d, time = %f ms\n", __func__, n_thread, (t_end - t_start) / 1000.0 / n_iter);
119+
}
120+
121+
{
122+
struct ggml_tensor * res = ggml_graph_node(gf, -1);
123+
124+
std::vector<float> data(res->ne[0] * res->ne[1], 0.0f);
125+
126+
ggml_backend_tensor_get(res, data.data(), 0, ggml_nbytes(res));
127+
128+
for (int i1 = 0; i1 < res->ne[1]; i1++) {
129+
for (int i0 = 0; i0 < res->ne[0]; i0++) {
130+
printf("%f ", data[i1*res->ne[0] + i0]);
131+
}
132+
printf("\n");
133+
}
134+
}
135+
136+
// 11. Free memory and exit
137+
ggml_free(ctx_cgraph);
138+
ggml_gallocr_free(allocr);
139+
ggml_free(ctx);
140+
ggml_backend_buffer_free(buffer);
141+
ggml_backend_free(backend);
142+
return 0;
143+
}
144+

ggml/src/ggml-metal.m

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,7 @@
234234
bool support_simdgroup_mm;
235235

236236
bool should_capture_next_compute;
237+
bool capture_started;
237238

238239
// abort ggml_metal_graph_compute if callback returns true
239240
ggml_abort_callback abort_callback;
@@ -456,6 +457,7 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
456457
GGML_METAL_LOG_INFO("%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
457458

458459
ctx->should_capture_next_compute = false;
460+
ctx->capture_started = false;
459461

460462
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
461463
if (@available(macOS 10.12, iOS 16.0, *)) {
@@ -893,13 +895,19 @@ static enum ggml_status ggml_metal_graph_compute(
893895
if (should_capture) {
894896
ctx->should_capture_next_compute = false;
895897

896-
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
897-
descriptor.captureObject = ctx->queue;
898+
if (!ctx->capture_started) {
899+
MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
900+
descriptor.captureObject = ctx->queue;
901+
descriptor.destination = MTLCaptureDestinationGPUTraceDocument;
902+
descriptor.outputURL = [NSURL fileURLWithPath:[NSString stringWithFormat:@"/tmp/perf-metal.gputrace"]];
898903

899-
NSError * error = nil;
900-
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
901-
GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
902-
GGML_ABORT("capture failed");
904+
NSError * error = nil;
905+
if (![[MTLCaptureManager sharedCaptureManager] startCaptureWithDescriptor:descriptor error:&error]) {
906+
GGML_METAL_LOG_ERROR("%s: error: unable to start capture '%s'\n", __func__, [[error localizedDescription] UTF8String]);
907+
GGML_ABORT("capture failed");
908+
} else {
909+
ctx->capture_started = true;
910+
}
903911
}
904912
}
905913

@@ -3066,7 +3074,7 @@ static enum ggml_status ggml_metal_graph_compute(
30663074
[next_buffer commit];
30673075
}
30683076

3069-
if (should_capture) {
3077+
if (!should_capture && ctx->capture_started) {
30703078
[[MTLCaptureManager sharedCaptureManager] stopCapture];
30713079
}
30723080

0 commit comments

Comments
 (0)