
Commit 028d3f7

Author: Lorenzo Toniazzi (committed)
Metal running (still buffer issues)
1 parent 5c4ba81 commit 028d3f7

File tree

3 files changed (+414 -10 lines)


BRANCH_SETUP.md

Lines changed: 253 additions & 1 deletion
@@ -36,6 +36,8 @@ Run main with base model and lora adapter to hot-swap
-n 128
```

Working but `ggml_metal_get_buffer: error: tensor 'blk.16.attn_v.weight.loraB' buffer is nil`

With `ngl > 0` the code breaks, probably because the LoRA tensors interact with the base tensors (as in `lora_mul_mat`) while the LoRA tensors are not allocated in the GPU buffer that holds the base tensors.
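
One plausible direction (my assumption, not something verified against this branch): allocate the LoRA tensors with the same backend buffer type as the offloaded base weights, so that the Metal backend can resolve both operands of `lora_mul_mat`. A minimal sketch using the ggml-backend calls already referenced in the tutorial below; `alloc_lora_tensors`, `lora_ctx` and `n_gpu_layers` are hypothetical names:

```cpp
// Sketch (assumption): put the LoRA tensors in the same buffer type as the
// offloaded base tensors, so the Metal backend can find both operands.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#ifdef GGML_USE_METAL
# include "ggml-metal.h"
#endif

// hypothetical helper: lora_ctx holds the loraA/loraB tensors,
// n_gpu_layers mirrors the -ngl setting
static ggml_backend_buffer_t alloc_lora_tensors(struct ggml_context * lora_ctx, int n_gpu_layers) {
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
#ifdef GGML_USE_METAL
    if (n_gpu_layers > 0) {
        buft = ggml_backend_metal_buffer_type(); // match the buffer type of the base weights
    }
#endif
    return ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, buft);
}
```
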
# Logic
@@ -47,4 +49,254 @@ With `ngl > 0` the code breaks. Probably because the Lora tensors try to interac

- Only one LoRA adapter can be passed.
- Applying the adapter only to the Q, K, V matrices to keep the code contained (the fine-tuning trained LoRA tensors for all linear layers); a naming sketch follows this list.
- GPU not supported

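For reference, a hypothetical helper that enumerates the per-layer Q/K/V adapter tensor names this scheme implies. The naming is inferred from the error message above (`blk.16.attn_v.weight.loraB`), assuming a matching `.loraA` counterpart; it is not taken from the branch:

```cpp
#include <string>
#include <vector>

// Hypothetical: list the LoRA tensor names expected for the Q, K, V projections,
// following the "blk.<layer>.<proj>.weight.lora{A,B}" pattern seen in the error above.
static std::vector<std::string> lora_qkv_tensor_names(int n_layers) {
    std::vector<std::string> names;
    const char * projs[] = { "attn_q", "attn_k", "attn_v" };
    for (int il = 0; il < n_layers; ++il) {
        for (const char * p : projs) {
            names.push_back("blk." + std::to_string(il) + "." + p + ".weight.loraA");
            names.push_back("blk." + std::to_string(il) + "." + p + ".weight.loraB");
        }
    }
    return names;
}
```
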
# Tutorial
```cpp
#include "llama.h"

#include "unicode.h"

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

#ifdef GGML_USE_RPC
# include "ggml-rpc.h"
#endif

#ifdef GGML_USE_CUDA
# include "ggml-cuda.h"
#elif defined(GGML_USE_VULKAN)
# include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
# include "ggml-sycl.h"
#elif defined(GGML_USE_KOMPUTE)
# include "ggml-kompute.h"
#endif

#ifdef GGML_USE_METAL
# include "ggml-metal.h"
#endif

// TODO: replace with ggml API call
#define QK_K 256

#ifdef __has_include
#if __has_include(<unistd.h>)
#include <unistd.h>
#if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h>
#include <fcntl.h>
#endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
#endif
#endif
#endif

#if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <io.h>
#endif

#include <algorithm>
#include <array>
#include <cassert>
#include <cctype>
#include <cfloat>
#include <cinttypes>
#include <climits>
#include <cmath>
#include <cstdarg>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <forward_list>
#include <fstream>
#include <functional>
#include <future>
#include <initializer_list>
#include <locale>
#include <map>
#include <memory>
#include <mutex>
#include <numeric>
#include <queue>
#include <random>
#include <regex>
#include <set>
#include <sstream>
#include <stdexcept> // for std::runtime_error
#include <thread>
#include <type_traits>
#include <unordered_map>
#include "ggml-metal.h"

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_ATTRIBUTE_FORMAT(...)
#endif

#define LLAMA_MAX_NODES   8192
#define LLAMA_MAX_EXPERTS 160

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // tensor data is allocated in a backend buffer below
    };

    // The library allows the user to define a certain function using the available tensor operations. This function
    // definition is represented internally via a computation graph. Each tensor operation in the function definition
    // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
    // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
    // using one of the available optimization algorithms.
    //
    // For example, here we define the function: f(x) = a*x^2 + b

    // memory allocation happens here
    // create the context (no tensor data is allocated, because no_alloc = true)
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);

    ggml_set_param(ctx, x); // x is an input variable

    struct ggml_tensor * a  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * x2 = ggml_mul(ctx, x, x);
    struct ggml_tensor * f  = ggml_add(ctx, ggml_mul(ctx, a, x2), b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);

    // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    // ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
    if (buf == nullptr) {
        throw std::runtime_error("unable to allocate backend buffer");
    }
    ggml_used_mem(ctx);

    // llama_default_buffer_type_offload(model, layer_gpu); is used in llama.cpp
    // How to check in which buffer the context was allocated?
    // Can we look at single tensors? Option: check how it is initialized in the base model.

    // Try this:
    // You can simplify all of this for testing: if you are using CPU only, just run with -ngl 0
    // and allocate everything in a CPU buffer by using
    //   ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cpu_buffer_type());
    // or run with -ngl 99 and use a Metal buffer type instead with
    //   ggml_backend_metal_buffer_type()
    // It will still run if you allocate the tensors in the wrong buffer type as long as you use ggml-backend
    // to allocate the tensors, it will just be slower.

    // Notice that the function definition above does not involve any actual computation. The computation is performed only
    // when the user explicitly requests it. For example, to compute the function's value at x = 2.0:

    ggml_build_forward_expand(gf, f);

    // set the input variable and parameter values
    ggml_set_f32(x, 2.0f);
    ggml_set_f32(a, 3.0f);
    ggml_set_f32(b, 4.0f);

    ggml_graph_compute_with_ctx(ctx, gf, 1);

    printf("f = %f\n", ggml_get_f32_1d(f, 0));

    // The actual computation is performed in the ggml_graph_compute() function.
    //
    // The ggml_new_tensor_...() functions create new tensors. They are allocated in the memory buffer provided to the
    // ggml_init() function. You have to be careful not to exceed the memory buffer size. Therefore, you have to know
    // in advance how much memory you need for your computation. Alternatively, you can allocate a large enough memory
    // and after defining the computation graph, call the ggml_used_mem() function to find out how much memory was
    // actually needed.
    //
    // The ggml_set_param() function marks a tensor as an input variable. This is used by the automatic
    // differentiation and optimization algorithms.
    //
    // The described approach allows to define the function graph once and then compute its forward or backward graphs
    // multiple times. All computations will use the same memory buffer allocated in the ggml_init() function. This way
    // the user can avoid the memory allocation overhead at runtime.
    //
    // The library supports multi-dimensional tensors - up to 4 dimensions. The FP16 and FP32 data types are first class
    // citizens, but in theory the library can be extended to support FP8 and integer data types.
    //
    // Each tensor operation produces a new tensor. Initially the library was envisioned to support only the use of unary
    // and binary operations. Most of the available operations fall into one of these two categories. With time, it became
    // clear that the library needs to support more complex operations. The way to support these operations is not clear
    // yet, but a few examples are demonstrated in the following operations:
    //
    //   - ggml_permute()
    //   - ggml_conv_1d_1s()
    //   - ggml_conv_1d_2s()
    //
    // For each tensor operator, the library implements a forward and backward computation function. The forward function
    // computes the output tensor value given the input tensor values. The backward function computes the adjoint of the
    // input tensors given the adjoint of the output tensor. For a detailed explanation of what this means, take a
    // calculus class, or watch the following video:
    //
    //   What is Automatic Differentiation?
    //   https://www.youtube.com/watch?v=wG_nF1awSSY

    // ## Tensor data (struct ggml_tensor)
    //
    // The tensors are stored in memory via the ggml_tensor struct. The structure provides information about the size of
    // the tensor, the data type, and the memory buffer where the tensor data is stored. Additionally, it contains
    // pointers to the "source" tensors - i.e. the tensors that were used to compute the current tensor. For example:

    struct ggml_tensor * c = ggml_add(ctx, a, b);

    assert(c->src[0] == a);
    assert(c->src[1] == b);

    // The multi-dimensional tensors are stored in row-major order. The ggml_tensor struct contains fields for the
    // number of elements in each dimension ("ne") as well as the number of bytes ("nb", a.k.a. stride). This allows
    // to store tensors that are not contiguous in memory, which is useful for operations such as transposition and
    // permutation. All tensor operations have to take the stride into account and not assume that the tensor is
    // contiguous in memory.

    // The data of the tensor is accessed via the "data" pointer. For example:

    const int nx = 2;
    const int ny = 3;

    struct ggml_tensor * A = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            *(float *) ((char *) A->data + y*A->nb[1] + x*A->nb[0]) = x + y;
        }
    }

    //
    // Alternatively, there are helper functions, such as ggml_get_f32_1d() and ggml_set_f32_1d() that can be used.
    //
}
```
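
With x = 2, a = 3, b = 4, the program should print `f = 16.000000` (3·2² + 4), assuming the buffer allocation and compute step succeed. Note that `ggml_set_f32` and `ggml_graph_compute_with_ctx` go through the tensors' `data` pointers, which works here only because the Metal buffer is host-mapped. A self-contained sketch of the same computation driven through the ggml-backend API instead (my assumption of the intended usage, not part of this commit):

```cpp
// Sketch (assumption): compute f(x) = a*x^2 + b via the ggml-backend API,
// moving data in and out with ggml_backend_tensor_set/get instead of raw pointers.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-metal.h"
#include <cstdio>

int main() {
    struct ggml_init_params params = { /*.mem_size =*/ 16*1024*1024, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);
    struct ggml_tensor * f = ggml_add(ctx, ggml_mul(ctx, a, ggml_mul(ctx, x, x)), b);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, f);

    // allocate every tensor in the context (inputs and intermediates) in a Metal buffer
    ggml_backend_buffer_t buf     = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
    ggml_backend_t        backend = ggml_backend_metal_init();

    const float xv = 2.0f, av = 3.0f, bv = 4.0f;
    ggml_backend_tensor_set(x, &xv, 0, sizeof(xv));
    ggml_backend_tensor_set(a, &av, 0, sizeof(av));
    ggml_backend_tensor_set(b, &bv, 0, sizeof(bv));

    ggml_backend_graph_compute(backend, gf);

    float fv = 0.0f;
    ggml_backend_tensor_get(f, &fv, 0, sizeof(fv));
    printf("f = %f\n", fv); // expected: 16.000000

    ggml_backend_free(backend);
    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
    return 0;
}
```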

examples/main/main.cpp

Lines changed: 67 additions & 0 deletions
@@ -117,7 +117,74 @@ static void llama_log_callback_logTee(ggml_log_level level, const char * text, v
    LOG_TEE("%s", text);
}

#include "ggml-metal.h"

// Returns true if ptr lies in the half-open range [buffer_start, buffer_start + buffer_size).
bool is_pointer_in_buffer_range(void * ptr, void * buffer_start, size_t buffer_size) {
    return ((char *) ptr >= (char *) buffer_start) && ((char *) ptr < ((char *) buffer_start + buffer_size));
}

// Walks all tensors in the context and checks that their data pointers fall inside the given backend buffer.
void verify_tensor_allocation(struct ggml_context * ctx, ggml_backend_buffer_t buffer, size_t buffer_size) {
    void * buffer_base = ggml_backend_buffer_get_base(buffer); // compare against the buffer's base address, not the handle
    struct ggml_tensor * first = ggml_get_first_tensor(ctx);
    for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        if (t->data != NULL) {
            if (!is_pointer_in_buffer_range(t->data, buffer_base, buffer_size)) {
                fprintf(stderr, "Tensor %s is not within the allocated buffer range.\n", t->name);
            } else {
                printf("Tensor %s is correctly allocated in the buffer.\n", t->name);
            }
        }
    }
}

int main(int argc, char ** argv) {

    // The library allows the user to define a certain function using the available tensor operations. This function
    // definition is represented internally via a computation graph. Each tensor operation in the function definition
    // corresponds to a node in the graph. Having the computation graph defined, the user can choose to compute the
    // function's value and/or its gradient with respect to the input variables. Optionally, the function can be optimized
    // using one of the available optimization algorithms.
    //
    // For example, here we define the function: f(x) = a*x^2 + b

    // memory allocation happens here
    // create the context (no tensor data is allocated, because no_alloc = true)
    struct ggml_init_params _params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * _ctx = ggml_init(_params);

    struct ggml_tensor * x = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);

    // ggml_set_param(_ctx, x); // x is an input variable

    // struct ggml_tensor * a  = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
    // struct ggml_tensor * b  = ggml_new_tensor_1d(_ctx, GGML_TYPE_F32, 1);
    // struct ggml_tensor * x2 = ggml_mul(_ctx, x, x);
    // struct ggml_tensor * f  = ggml_add(_ctx, ggml_mul(_ctx, a, x2), b);

    // struct ggml_cgraph * gf = ggml_new_graph(_ctx);

    // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_cpu_buffer_type());
    // // ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type());
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(_ctx, ggml_backend_metal_buffer_type());
    if (buf == nullptr) {
        throw std::runtime_error("unable to allocate backend buffer");
    }
    else {
        // use the size of the buffer that was actually allocated, not the buffer type's maximum size
        size_t buffer_size = ggml_backend_buffer_get_size(buf);

        // Verify tensor allocations
        verify_tensor_allocation(_ctx, buf, buffer_size);
    }
    ggml_used_mem(_ctx);
    //

    gpt_params params;
    g_params = &params;

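As an aside, the "how to check which buffer a tensor ended up in" question from the notes above can also be answered per tensor: a sketch, assuming the ggml-backend introspection helpers (`ggml_backend_buffer_name`, `ggml_backend_buffer_get_base`, `ggml_backend_buffer_get_size`) and the tensor's `buffer` field are available in this tree:

```cpp
// Sketch (assumption): report the backend buffer each tensor was allocated in,
// instead of comparing raw data pointers against a single buffer range.
#include "ggml.h"
#include "ggml-backend.h"
#include <cstdio>

static void print_tensor_buffers(struct ggml_context * ctx) {
    for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        if (t->buffer == NULL) {
            printf("%-40s -> not allocated in a backend buffer\n", t->name);
            continue;
        }
        printf("%-40s -> %s (%zu bytes at base %p)\n",
               t->name,
               ggml_backend_buffer_name(t->buffer),      // e.g. "Metal" or "CPU"
               ggml_backend_buffer_get_size(t->buffer),
               ggml_backend_buffer_get_base(t->buffer));
    }
}
```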