Commit d17540b

kompute : fix ggml_vk_allocate failure control flow
The correct way to indicate an OOM condition is for alloc_buffer to return NULL. This fixes undefined behavior caused by passing an exception over the C boundary. The rest of the changes help fix VRAM leaks in GPT4All when model loading fails on GPU.

Signed-off-by: Jared Van Bortel <[email protected]>
Parent: 5810507
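
For context, the pattern this commit adopts is the usual one for C ABIs: an allocation hook that C code reaches through a function-pointer table signals out-of-memory by returning a null pointer, because letting a C++ exception unwind through C stack frames is undefined behavior. The sketch below only illustrates that shape; demo_buffer and demo_alloc_buffer are hypothetical names, not the real ggml interfaces.

#include <cstddef>
#include <cstdio>
#include <new>

// Hypothetical buffer handle standing in for a backend buffer.
struct demo_buffer { void * data; size_t size; };

// C-callable allocation entry point: failure is reported by returning NULL,
// never by throwing, so nothing unwinds across the C boundary.
extern "C" demo_buffer * demo_alloc_buffer(size_t size) {
    void * data = ::operator new(size, std::nothrow);   // nullptr on OOM, no exception
    if (data == nullptr) {
        std::fprintf(stderr, "demo_alloc_buffer: out of memory (%zu bytes)\n", size);
        return nullptr;                                  // caller checks for NULL
    }
    demo_buffer * buf = new (std::nothrow) demo_buffer{data, size};
    if (buf == nullptr) {
        ::operator delete(data);
        return nullptr;
    }
    return buf;
}

int main() {
    demo_buffer * buf = demo_alloc_buffer(1u << 20);
    if (buf == nullptr) {
        return 1;   // allocation failed; the caller sees NULL, not an exception
    }
    ::operator delete(buf->data);
    delete buf;
    return 0;
}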

File tree: 3 files changed, 13 additions and 14 deletions

ggml-backend.h

Lines changed: 3 additions & 0 deletions
@@ -3,6 +3,9 @@
 #include "ggml.h"
 #include "ggml-alloc.h"
 
+#include <stdbool.h>
+#include <stddef.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif

ggml-kompute.cpp

Lines changed: 9 additions & 1 deletion
@@ -487,7 +487,7 @@ vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, v
     vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory);
     if (r != vk::Result::eSuccess) {
         std::cerr << "Error allocating memory " << vk::to_string(r) << std::endl;
-        throw std::runtime_error("Error allocating vulkan memory.");
+        return nullptr;
     }
     return vkDeviceMemory;
 }
@@ -509,9 +509,13 @@ static ggml_vk_memory ggml_vk_allocate(size_t size) {
     bool isHostVisible = false;
     {
         memory.primaryBuffer = ggml_vk_allocate_buffer(size);
+
         vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.primaryBuffer);
         vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal;
         memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible);
+        if (!memory.primaryMemory)
+            return {};
+
         komputeManager()->device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0);
         if (isHostVisible) {
             vk::Result r = komputeManager()->device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data);
@@ -1906,6 +1910,10 @@ static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffe
 static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
     ggml_backend_kompute_device_ref(buft);
     auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size));
+    if (!ctx->primaryMemory) {
+        delete ctx;
+        return nullptr; // allocation failed
+    }
     return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size);
 }
 
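
One detail worth spelling out in the second hunk: return {} works as a failure signal because a value-initialized ggml_vk_memory has null members, which is exactly what the new check in alloc_buffer inspects via ctx->primaryMemory. Below is a minimal sketch of that C++ behavior, using a hypothetical stand-in struct rather than the real ggml_vk_memory:

#include <cassert>

// Hypothetical stand-in for ggml_vk_memory; the real struct holds Vulkan
// handles, but the relevant property is the same: value-initialization
// leaves the pointer members null.
struct demo_vk_memory {
    int * primaryMemory = nullptr;
    int * primaryBuffer = nullptr;
};

static demo_vk_memory demo_allocate(bool simulate_oom) {
    if (simulate_oom) {
        return {};   // all members null -> the caller can detect the failure
    }
    static int backing;
    return { &backing, &backing };
}

int main() {
    demo_vk_memory ok  = demo_allocate(false);
    demo_vk_memory bad = demo_allocate(true);
    assert(ok.primaryMemory != nullptr);
    assert(bad.primaryMemory == nullptr);   // same check alloc_buffer performs on ctx->primaryMemory
    return 0;
}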

llama.cpp

Lines changed: 1 addition & 13 deletions
@@ -12127,7 +12127,7 @@ void llama_free_model(struct llama_model * model) {
     delete model;
 }
 
-static struct llama_context * llama_new_context_with_model_internal(
+struct llama_context * llama_new_context_with_model(
         struct llama_model * model,
         struct llama_context_params params) {
 
@@ -12426,18 +12426,6 @@
     return ctx;
 }
 
-struct llama_context * llama_new_context_with_model(
-        struct llama_model * model,
-        struct llama_context_params params
-) {
-    try {
-        return llama_new_context_with_model_internal(model, params);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to init context: %s\n", __func__, err.what());
-        return nullptr;
-    }
-}
-
 void llama_free(struct llama_context * ctx) {
     delete ctx;
 }
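
The llama.cpp change removes the wrapper that caught exceptions (including the one ggml_vk_allocate used to throw) at the C API boundary; with the kompute backend now reporting OOM through a NULL buffer, the wrapper is no longer needed for this path. For API clients such as GPT4All, the practical effect is that a failed context creation can be handled by freeing the model, which is where the VRAM otherwise leaked. The caller-side sketch below is an assumption about typical usage, not code from this commit; it uses public llama.h functions of this period (llama_load_model_from_file, llama_new_context_with_model, llama_free_model), a placeholder model path, and omits backend initialization for brevity.

#include "llama.h"

#include <cstdio>

int main() {
    // Placeholder path -- substitute a real GGUF model file.
    const char * model_path = "/path/to/model.gguf";

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file(model_path, mparams);
    if (model == NULL) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }

    struct llama_context_params cparams = llama_context_default_params();
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == NULL) {
        // A GPU allocation failure during context setup surfaces here as NULL
        // rather than as an exception, so the model (and the VRAM it holds)
        // can still be released cleanly.
        std::fprintf(stderr, "failed to create context\n");
        llama_free_model(model);
        return 1;
    }

    // ... use the context ...

    llama_free(ctx);
    llama_free_model(model);
    return 0;
}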
