Commit 11cb93a

vulkan : add backend registry / device interfaces

1 parent c83ad6d

4 files changed, +161 -42 lines

ggml/include/ggml-vulkan.h

Lines changed: 2 additions & 0 deletions
@@ -24,6 +24,8 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
 // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
 GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
 
+GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
+
 #ifdef __cplusplus
 }
 #endif
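The only public-API change: a registry entry point for the Vulkan backend. A minimal usage sketch, assuming nothing beyond the declaration above (the surrounding main() is illustrative):

#include "ggml-vulkan.h"

int main(void) {
    // The registry handle is a static object inside ggml-vulkan.cpp (see below),
    // so calling this repeatedly is cheap and always returns the same pointer.
    ggml_backend_reg_t vk_reg = ggml_backend_vk_reg();
    (void) vk_reg; // normally handed to the global backend registry (next file)
    return 0;
}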

ggml/src/ggml-backend.cpp

Lines changed: 8 additions & 0 deletions
@@ -517,6 +517,10 @@ void ggml_backend_reg_set_log_callback(ggml_backend_reg_t reg, ggml_log_callback
 #include "ggml-cuda.h"
 #endif
 
+#ifdef GGML_USE_VULKAN
+#include "ggml-vulkan.h"
+#endif
+
 struct ggml_backend_registry {
     std::vector<ggml_backend_reg_t> backends;
     std::vector<ggml_backend_dev_t> devices;
@@ -526,6 +530,10 @@ struct ggml_backend_registry {
         register_backend(ggml_backend_cuda_reg());
 #endif
 
+#ifdef GGML_USE_VULKAN
+        register_backend(ggml_backend_vk_reg());
+#endif
+
         register_backend(ggml_backend_cpu_reg());
 
         // TODO: sycl, metal, vulkan, kompute, cann
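register_backend() itself is not part of this diff; a hedged sketch of the registration pattern the two vectors above imply, using only the interface members added in this commit (register_backend_sketch and the ggml-backend-impl.h include are assumptions for illustration, not the actual implementation):

#include <vector>
#include "ggml-backend-impl.h" // assumed: internal ggml_backend_reg / ggml_backend_device definitions

// Plausible shape of the registration step: remember the backend, then remember
// every device it reports through its registry interface.
static void register_backend_sketch(std::vector<ggml_backend_reg_t> & backends,
                                    std::vector<ggml_backend_dev_t> & devices,
                                    ggml_backend_reg_t reg) {
    backends.push_back(reg);
    for (size_t i = 0; i < reg->iface.get_device_count(reg); i++) {
        devices.push_back(reg->iface.get_device(reg, i));
    }
}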

ggml/src/ggml-vulkan.cpp

Lines changed: 149 additions & 0 deletions
@@ -6779,6 +6779,155 @@ void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total
     }
 }
 
+//////////////////////////
+
+struct ggml_backend_vk_device_context {
+    int device;
+    std::string name;
+    std::string description;
+};
+
+static const char * ggml_backend_vk_device_get_name(ggml_backend_dev_t dev) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    return ctx->name.c_str();
+}
+
+static const char * ggml_backend_vk_device_get_description(ggml_backend_dev_t dev) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    return ctx->description.c_str();
+}
+
+static void ggml_backend_vk_device_get_memory(ggml_backend_dev_t device, size_t * free, size_t * total) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)device->context;
+    ggml_backend_vk_get_device_memory(ctx->device, free, total);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_vk_device_get_buffer_type(ggml_backend_dev_t dev) {
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    return ggml_backend_vk_buffer_type(ctx->device);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_vk_device_get_host_buffer_type(ggml_backend_dev_t dev) {
+    UNUSED(dev);
+    return ggml_backend_vk_host_buffer_type();
+}
+
+static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
+    UNUSED(dev);
+    return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+}
+
+static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+    props->name = ggml_backend_vk_device_get_name(dev);
+    props->description = ggml_backend_vk_device_get_description(dev);
+    props->type = ggml_backend_vk_device_get_type(dev);
+    ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
+    props->caps = {
+        /* async */ false,
+        /* host_buffer */ true,
+        /* events */ false,
+    };
+}
+
+static ggml_backend_t ggml_backend_vk_device_init(ggml_backend_dev_t dev, const char * params) {
+    UNUSED(params);
+    ggml_backend_vk_device_context * ctx = (ggml_backend_vk_device_context *)dev->context;
+    return ggml_backend_vk_init(ctx->device);
+}
+
+static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    // TODO: move here
+    UNUSED(dev);
+    return ggml_backend_vk_supports_op(nullptr, op);
+}
+
+static bool ggml_backend_vk_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+    // TODO: move here
+    UNUSED(dev);
+    return ggml_backend_vk_supports_buft(nullptr, buft);
+}
+
+static bool ggml_backend_vk_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
+    // TODO: move here
+    UNUSED(dev);
+    return ggml_backend_vk_offload_op(nullptr, op);
+}
+
+static const struct ggml_backend_device_i ggml_backend_vk_device_i = {
+    /* .get_name = */ ggml_backend_vk_device_get_name,
+    /* .get_description = */ ggml_backend_vk_device_get_description,
+    /* .get_memory = */ ggml_backend_vk_device_get_memory,
+    /* .get_type = */ ggml_backend_vk_device_get_type,
+    /* .get_props = */ ggml_backend_vk_device_get_props,
+    /* .init_backend = */ ggml_backend_vk_device_init,
+    /* .get_buffer_type = */ ggml_backend_vk_device_get_buffer_type,
+    /* .get_host_buffer_type = */ ggml_backend_vk_device_get_host_buffer_type,
+    /* .buffer_from_host_ptr = */ NULL,
+    /* .supports_op = */ ggml_backend_vk_device_supports_op,
+    /* .supports_buft = */ ggml_backend_vk_device_supports_buft,
+    /* .offload_op = */ ggml_backend_vk_device_offload_op,
+    /* .event_new = */ NULL,
+    /* .event_free = */ NULL,
+    /* .event_synchronize = */ NULL,
+};
+
+static const char * ggml_backend_vk_reg_get_name(ggml_backend_reg_t reg) {
+    UNUSED(reg);
+    return GGML_VK_NAME;
+}
+
+static size_t ggml_backend_vk_reg_get_device_count(ggml_backend_reg_t reg) {
+    UNUSED(reg);
+    return ggml_backend_vk_get_device_count();
+}
+
+static ggml_backend_dev_t ggml_backend_vk_reg_get_device(ggml_backend_reg_t reg, size_t device) {
+    static std::vector<ggml_backend_dev_t> devices;
+
+    static bool initialized = false;
+
+    {
+        static std::mutex mutex;
+        std::lock_guard<std::mutex> lock(mutex);
+        if (!initialized) {
+            for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) {
+                ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
+                ctx->device = i;
+                char desc[256];
+                ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
+                ctx->name = GGML_VK_NAME + std::to_string(i);
+                ctx->description = desc;
+                devices.push_back(new ggml_backend_device {
+                    /* .iface = */ ggml_backend_vk_device_i,
+                    /* .reg = */ reg,
+                    /* .context = */ ctx,
+                });
+            }
+            initialized = true;
+        }
+    }
+
+    GGML_ASSERT(device < devices.size());
+    return devices[device];
+}
+
+static const struct ggml_backend_reg_i ggml_backend_vk_reg_i = {
+    /* .get_name = */ ggml_backend_vk_reg_get_name,
+    /* .get_device_count = */ ggml_backend_vk_reg_get_device_count,
+    /* .get_device = */ ggml_backend_vk_reg_get_device,
+    /* .get_proc_address = */ NULL,
+    /* .set_log_callback = */ NULL,
+};
+
+ggml_backend_reg_t ggml_backend_vk_reg() {
+    static ggml_backend_reg reg = {
+        /* .iface = */ ggml_backend_vk_reg_i,
+        /* .context = */ nullptr,
+    };
+
+    return &reg;
+}
+
 // Extension availability
 static bool ggml_vk_instance_validation_ext_available(const std::vector<vk::ExtensionProperties>& instance_extensions) {
 #ifdef GGML_VULKAN_VALIDATE
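Taken together, the device and registry interfaces above let a caller discover Vulkan devices without touching any Vulkan-specific API. A sketch of that flow, going through the iface function pointers directly (this assumes the internal struct definitions from ggml-backend-impl.h are visible, as they are inside ggml; init_first_vk_device is an illustrative name):

#include <cstdio>
#include "ggml-vulkan.h"
#include "ggml-backend-impl.h"

// Enumerate the Vulkan devices exposed by ggml_backend_vk_reg() and create a
// backend for the first full GPU found.
static ggml_backend_t init_first_vk_device(void) {
    ggml_backend_reg_t reg = ggml_backend_vk_reg();
    const size_t n_dev = reg->iface.get_device_count(reg);
    for (size_t i = 0; i < n_dev; i++) {
        ggml_backend_dev_t dev = reg->iface.get_device(reg, i);
        struct ggml_backend_dev_props props;
        dev->iface.get_props(dev, &props);
        printf("device %zu: %s (%s), %zu/%zu bytes free\n",
               i, props.name, props.description, props.memory_free, props.memory_total);
        if (props.type == GGML_BACKEND_DEVICE_TYPE_GPU_FULL) {
            return dev->iface.init_backend(dev, /* params = */ nullptr); // forwards to ggml_backend_vk_init()
        }
    }
    return nullptr;
}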

src/llama.cpp

Lines changed: 2 additions & 42 deletions
@@ -12,9 +12,7 @@
 #  include "ggml-rpc.h"
 #endif
 
-#if defined(GGML_USE_VULKAN)
-#  include "ggml-vulkan.h"
-#elif defined(GGML_USE_SYCL)
+#if defined(GGML_USE_SYCL)
 #  include "ggml-sycl.h"
 #elif defined(GGML_USE_KOMPUTE)
 #  include "ggml-kompute.h"
@@ -3429,8 +3427,6 @@ static int llama_get_device_count(const llama_model & model) {
     count += 1;
 #elif defined(GGML_USE_SYCL)
     count += ggml_backend_sycl_get_device_count();
-#elif defined(GGML_USE_VULKAN)
-    count += ggml_backend_vk_get_device_count();
 #elif defined(GGML_USE_CANN)
     count += ggml_backend_cann_get_device_count();
 #endif
@@ -3462,10 +3458,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(const llama_mode
     }
 #elif defined(GGML_USE_CPU_HBM)
     buft = ggml_backend_cpu_hbm_buffer_type();
-#elif defined(GGML_USE_VULKAN)
-    if (host_buffer) {
-        buft = ggml_backend_vk_host_buffer_type();
-    }
 #endif
 
     if (buft == nullptr) {
@@ -3495,8 +3487,6 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_
 
 #if defined(GGML_USE_METAL)
     buft = ggml_backend_metal_buffer_type();
-#elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(device);
 #elif defined(GGML_USE_SYCL)
     buft = ggml_backend_sycl_buffer_type(device);
 #elif defined(GGML_USE_KOMPUTE)
@@ -3569,11 +3559,6 @@ static size_t llama_get_device_memory(const llama_model & model, int device) {
     size_t free;
     ggml_backend_sycl_get_device_memory(device, &free, &total);
     return free;
-#elif defined(GGML_USE_VULKAN)
-    size_t total;
-    size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
-    return free;
 #elif defined(GGML_USE_CANN)
     size_t total;
     size_t free;
@@ -19046,7 +19031,7 @@ bool llama_supports_mlock(void) {
 }
 
 bool llama_supports_gpu_offload(void) {
-#if defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || \
+#if defined(GGML_USE_METAL) || \
     defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
     // Defined when llama.cpp is compiled with support for offloading model layers to GPU.
     return true;
@@ -19359,31 +19344,6 @@ struct llama_context * llama_new_context_with_model(
         }
         ctx->backends.push_back(ctx->backend_metal);
     }
-#elif defined(GGML_USE_VULKAN)
-    if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
-        LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);
-        llama_free(ctx);
-        return nullptr;
-    }
-    if (model->split_mode == LLAMA_SPLIT_MODE_NONE) {
-        ggml_backend_t backend = ggml_backend_vk_init(main_gpu);
-        if (backend == nullptr) {
-            LLAMA_LOG_ERROR("%s: failed to initialize Vulkan backend\n", __func__);
-            llama_free(ctx);
-            return nullptr;
-        }
-        ctx->backends.push_back(backend);
-    } else {
-        for (int device = 0; device < ggml_backend_vk_get_device_count(); ++device) {
-            ggml_backend_t backend = ggml_backend_vk_init(device);
-            if (backend == nullptr) {
-                LLAMA_LOG_ERROR("%s: failed to initialize Vulkan%d backend\n", __func__, device);
-                llama_free(ctx);
-                return nullptr;
-            }
-            ctx->backends.push_back(backend);
-        }
-    }
 #elif defined(GGML_USE_SYCL)
     // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
     if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
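The removals above are the point of the change on the llama.cpp side: Vulkan no longer needs its own #ifdef branches for device count, buffer types, memory queries, or context setup, because the registry/device interfaces expose the same information generically. A hedged sketch of what such a generic query looks like with the interfaces from this commit (the wrapper function and the ggml-backend-impl.h include are illustrative; the actual generic wiring in llama.cpp is not part of this diff):

#include "ggml-backend-impl.h" // assumed visible for the sketch

// Generic replacement for the removed per-backend memory query:
// ask the device itself instead of calling ggml_backend_vk_get_device_memory().
static size_t device_free_memory_sketch(ggml_backend_reg_t reg, size_t i) {
    ggml_backend_dev_t dev = reg->iface.get_device(reg, i);
    size_t free  = 0;
    size_t total = 0;
    dev->iface.get_memory(dev, &free, &total);
    return free;
}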
