Skip to content

vulkan: use timestamp queries for GGML_VULKAN_PERF #13817

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 27, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion ggml/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,6 @@ option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks"
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
Expand Down
4 changes: 0 additions & 4 deletions ggml/src/ggml-vulkan/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -109,10 +109,6 @@ if (Vulkan_FOUND)
add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
endif()

if (GGML_VULKAN_PERF)
add_compile_definitions(GGML_VULKAN_PERF)
endif()

if (GGML_VULKAN_VALIDATE)
add_compile_definitions(GGML_VULKAN_VALIDATE)
endif()
Expand Down
85 changes: 69 additions & 16 deletions ggml/src/ggml-vulkan/ggml-vulkan.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#include "ggml-vulkan.h"
#include <vulkan/vulkan_core.h>
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_PERF) || defined(GGML_VULKAN_CHECK_RESULTS)
#if defined(GGML_VULKAN_RUN_TESTS) || defined(GGML_VULKAN_CHECK_RESULTS)
#include <chrono>
#include "ggml-cpu.h"
#endif
Expand Down Expand Up @@ -184,9 +184,7 @@ static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
#ifdef GGML_VULKAN_MEMORY_DEBUG
class vk_memory_logger;
#endif
#ifdef GGML_VULKAN_PERF
class vk_perf_logger;
#endif
static void ggml_vk_destroy_buffer(vk_buffer& buf);

static constexpr uint32_t mul_mat_vec_max_cols = 8;
Expand Down Expand Up @@ -442,9 +440,11 @@ struct vk_device_struct {
#ifdef GGML_VULKAN_MEMORY_DEBUG
std::unique_ptr<vk_memory_logger> memory_logger;
#endif
#ifdef GGML_VULKAN_PERF

// for GGML_VK_PERF_LOGGER
std::unique_ptr<vk_perf_logger> perf_logger;
#endif
vk::QueryPool query_pool;
uint32_t num_queries;

~vk_device_struct() {
VK_LOG_DEBUG("destroy device " << name);
Expand Down Expand Up @@ -828,8 +828,6 @@ class vk_memory_logger {
#define VK_LOG_MEMORY(msg) ((void) 0)
#endif // GGML_VULKAN_MEMORY_DEBUG

#if defined(GGML_VULKAN_PERF)

class vk_perf_logger {
public:
void print_timings() {
Expand All @@ -839,7 +837,7 @@ class vk_perf_logger {
for (const auto& time : t.second) {
total += time;
}
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " ms" << std::endl;
std::cerr << t.first << ": " << t.second.size() << " x " << (total / t.second.size() / 1000.0) << " us" << std::endl;
}

timings.clear();
Expand Down Expand Up @@ -868,7 +866,6 @@ class vk_perf_logger {
private:
std::map<std::string, std::vector<uint64_t>> timings;
};
#endif // GGML_VULKAN_PERF

struct ggml_backend_vk_context {
std::string name;
Expand Down Expand Up @@ -958,6 +955,8 @@ struct vk_instance_t {
static bool vk_instance_initialized = false;
static vk_instance_t vk_instance;

static bool vk_perf_logger_enabled = false;

#ifdef GGML_VULKAN_CHECK_RESULTS
static size_t vk_skip_checks;
static size_t vk_output_tensor;
Expand Down Expand Up @@ -2757,9 +2756,9 @@ static vk_device ggml_vk_get_device(size_t idx) {
#ifdef GGML_VULKAN_MEMORY_DEBUG
device->memory_logger = std::unique_ptr<vk_memory_logger>(new vk_memory_logger());
#endif
#ifdef GGML_VULKAN_PERF
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
#endif
if (vk_perf_logger_enabled) {
device->perf_logger = std::unique_ptr<vk_perf_logger>(new vk_perf_logger());
}

size_t dev_num = vk_instance.device_indices[idx];

Expand Down Expand Up @@ -3547,6 +3546,8 @@ static void ggml_vk_instance_init() {
vk_instance.instance = vk::createInstance(instance_create_info);
vk_instance_initialized = true;

vk_perf_logger_enabled = getenv("GGML_VK_PERF_LOGGER") != nullptr;

size_t num_available_devices = vk_instance.instance.enumeratePhysicalDevices().size();

// Emulate behavior of CUDA_VISIBLE_DEVICES for Vulkan
Expand Down Expand Up @@ -8884,7 +8885,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod

ctx->tensor_ctxs[node_idx] = compute_ctx;

#if defined(GGML_VULKAN_CHECK_RESULTS) || defined(GGML_VULKAN_PERF)
#if defined(GGML_VULKAN_CHECK_RESULTS)
// Force context reset on each node so that each tensor ends up in its own context
// and can be run and compared to its CPU equivalent separately
last_node = true;
Expand Down Expand Up @@ -9504,6 +9505,29 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
bool first_node_in_batch = true; // true if next node will be first node in a batch
int submit_node_idx = 0; // index to first node in a batch

vk_context compute_ctx;
if (vk_perf_logger_enabled) {
// allocate/resize the query pool
if (ctx->device->num_queries < cgraph->n_nodes + 1) {
if (ctx->device->query_pool) {
ctx->device->device.destroyQueryPool(ctx->device->query_pool);
}
VkQueryPoolCreateInfo query_create_info = { VK_STRUCTURE_TYPE_QUERY_POOL_CREATE_INFO };
query_create_info.queryType = VK_QUERY_TYPE_TIMESTAMP;
query_create_info.queryCount = cgraph->n_nodes + 100;
ctx->device->query_pool = ctx->device->device.createQueryPool(query_create_info);
ctx->device->num_queries = query_create_info.queryCount;
}

ctx->device->device.resetQueryPool(ctx->device->query_pool, 0, cgraph->n_nodes+1);

GGML_ASSERT(ctx->compute_ctx.expired());
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, 0);
}

// Submit after enough work has accumulated, to overlap CPU cmdbuffer generation with GPU execution.
// Estimate the amount of matmul work by looking at the weight matrix size, and submit every 100MB
// (and scaled down based on model size, so smaller models submit earlier).
Expand Down Expand Up @@ -9531,6 +9555,17 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg

bool enqueued = ggml_vk_build_graph(ctx, cgraph->nodes[i], i, cgraph->nodes[submit_node_idx], submit_node_idx, false, i == last_node, almost_ready, submit);

if (vk_perf_logger_enabled) {
if (ctx->compute_ctx.expired()) {
compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
ctx->compute_ctx = compute_ctx;
ggml_vk_ctx_begin(ctx->device, compute_ctx);
} else {
compute_ctx = ctx->compute_ctx.lock();
}
compute_ctx->s->buffer.writeTimestamp(vk::PipelineStageFlagBits::eAllCommands, ctx->device->query_pool, i+1);
}

if (enqueued) {
++submitted_nodes;

Expand All @@ -9552,9 +9587,27 @@ static ggml_status ggml_backend_vk_graph_compute(ggml_backend_t backend, ggml_cg
}
}

#ifdef GGML_VULKAN_PERF
ctx->device->perf_logger->print_timings();
#endif
if (vk_perf_logger_enabled) {
// End the command buffer and submit/wait
GGML_ASSERT(!ctx->compute_ctx.expired());
compute_ctx = ctx->compute_ctx.lock();
ggml_vk_ctx_end(compute_ctx);

ggml_vk_submit(compute_ctx, ctx->device->fence);
VK_CHECK(ctx->device->device.waitForFences({ ctx->device->fence }, true, UINT64_MAX), "GGML_VULKAN_PERF waitForFences");
ctx->device->device.resetFences({ ctx->device->fence });

// Get the results and pass them to the logger
std::vector<uint64_t> timestamps(cgraph->n_nodes + 1);
ctx->device->device.getQueryPoolResults(ctx->device->query_pool, 0, cgraph->n_nodes + 1, (cgraph->n_nodes + 1)*sizeof(uint64_t), timestamps.data(), sizeof(uint64_t), vk::QueryResultFlagBits::e64 | vk::QueryResultFlagBits::eWait);
for (int i = 0; i < cgraph->n_nodes; i++) {
if (!ggml_vk_is_empty(cgraph->nodes[i])) {
ctx->device->perf_logger->log_timing(cgraph->nodes[i], uint64_t((timestamps[i+1] - timestamps[i]) * ctx->device->properties.limits.timestampPeriod));
}
}

ctx->device->perf_logger->print_timings();
}

ggml_vk_graph_cleanup(ctx);

Expand Down
Loading