Commit 1d656d6

mqy and ggerganov authored
ggml : change ggml_graph_compute() API to not require context (#1999)
* ggml_graph_compute: deprecate using ggml_context, try to resolve issue #287
* rewrite: no longer consider backward compatibility; plan and make_plan
* minor: rename ctx as plan; const
* remove ggml_graph_compute from tests/test-grad0.c, but current change breaks backward
* add static ggml_graph_compute_sugar()
* minor: update comments
* reusable buffers
* ggml : more consistent naming + metal fixes
* ggml : fix docs
* tests : disable grad / opt + minor naming changes
* ggml : add ggml_graph_compute_with_ctx()
  - backwards compatible API
  - deduplicates a lot of copy-paste
* ci : enable test-grad0
* examples : factor out plan allocation into a helper function
* llama : factor out plan stuff into a helper function
* ci : fix env
* llama : fix duplicate symbols + refactor example benchmark
* ggml : remove obsolete assert + refactor n_tasks section
* ggml : fix indentation in switch
* llama : avoid unnecessary bool
* ggml : remove comments from source file and match order in header

---------

Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 7242140 commit 1d656d6
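In short: instead of passing a ggml_context, callers now ask ggml for a compute plan, provide the scratch memory themselves, and then execute the graph. A minimal sketch of the new calling convention, based on the helper added throughout the diffs below (graph construction elided):

#include <cstdint>
#include <vector>
#include "ggml.h"

void compute_graph(struct ggml_cgraph * gf, int n_threads) {
    // 1. ask ggml how much scratch memory this graph needs
    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);

    // 2. provide the work buffer ourselves; before this commit it was
    //    allocated out of the ggml_context behind the caller's back
    std::vector<uint8_t> work(plan.work_size);
    if (plan.work_size > 0) {
        plan.work_data = work.data();
    }

    // 3. execute - no ggml_context required anymore
    ggml_graph_compute(gf, &plan);
}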

File tree

13 files changed, +531 −409 lines changed


.github/workflows/build.yml

Lines changed: 8 additions & 5 deletions
@@ -16,7 +16,9 @@ on:
     paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu']
 
 env:
-  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+  BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
+  GGML_NLOOP: 3
+  GGML_NITER: 1
 
 jobs:
   ubuntu-focal-make:
@@ -64,7 +66,7 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest --verbose
+          ctest --verbose --timeout 900
 
   ubuntu-latest-cmake-sanitizer:
     runs-on: ubuntu-latest
@@ -99,7 +101,7 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest --verbose
+          ctest --verbose --timeout 900
 
   macOS-latest-make:
     runs-on: macos-latest
@@ -147,10 +149,11 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest --verbose
+          ctest --verbose --timeout 900
 
   windows-latest-cmake:
     runs-on: windows-latest
+
     env:
       OPENBLAS_VERSION: 0.3.23
       OPENCL_VERSION: 2023.04.17
@@ -249,7 +252,7 @@ jobs:
         if: ${{ matrix.build != 'clblast' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }} # Test AVX-512 only when possible
         run: |
           cd build
-          ctest -C Release --verbose
+          ctest -C Release --verbose --timeout 900
 
       - name: Get commit hash
         id: commit
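GGML_NLOOP and GGML_NITER are exported presumably for the re-enabled test-grad0 (the commit message mentions both "ci : enable test-grad0" and "ci : fix env"), keeping its runtime under the new 900-second ctest timeout. A sketch of the pattern such a test might use to read them; the helper name and the exact parsing in tests/test-grad0.c are assumptions:

#include <cstdlib>

// hypothetical helper: read an integer knob from the environment,
// falling back to a default when the variable is unset
static int env_int(const char * name, int def) {
    const char * val = std::getenv(name);
    return val ? std::atoi(val) : def;
}

// e.g. const int nloop = env_int("GGML_NLOOP", 3);
//      const int niter = env_int("GGML_NITER", 1);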

examples/baby-llama/baby-llama.cpp

Lines changed: 18 additions & 6 deletions
@@ -31,6 +31,17 @@ float frand_normal(struct random_normal_distribution * rnd) {
     return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
 }
 
+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 struct ggml_tensor * randomize_tensor(
         struct ggml_tensor * tensor,
         int ndims,
@@ -1569,6 +1580,8 @@ int main(int argc, char ** argv) {
     int n_tokens = model.hparams.n_ctx;
     int n_vocab = model.hparams.n_vocab;
 
+    std::vector<uint8_t> work_buffer;
+
     for (int ex=0; ex<n_examples; ++ex) {
         struct ggml_init_params params = {
             /*.mem_size   =*/ compute_size,
@@ -1586,7 +1599,6 @@ int main(int argc, char ** argv) {
         int n_past = 0;
 
         ggml_cgraph gf = {};
-        gf.n_threads = 1;
 
         get_example_targets_batch(ctx0, 64*ex+0, tokens_input, targets);
 
@@ -1595,7 +1607,7 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
 
         ggml_build_forward_expand(&gf, e);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
 
@@ -1611,7 +1623,7 @@ int main(int argc, char ** argv) {
         ggml_opt(ctx0, opt_params_lbfgs, e);
         //
         ggml_build_forward_expand(&gf, e);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
 
         float error_after_opt = ggml_get_f32_1d(e, 0);
 
@@ -1659,13 +1671,12 @@ int main(int argc, char ** argv) {
         struct ggml_context * ctx0 = ggml_init(params);
 
         ggml_cgraph gf = {};
-        gf.n_threads = 1;
 
         int n_past = 0;
         struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
 
         ggml_build_forward_expand(&gf, logits);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, /*n_threads*/ 1);
 
         struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
         struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -1687,10 +1698,11 @@ int main(int argc, char ** argv) {
     }
 
     print_matrix(model.tok_embeddings);
-
     printf("done\n");
+
     // ggml_free(kv_self.ctx);
     // ggml_free(model_lora.ctx);
     ggml_free(model.ctx);
+
     return 0;
 }
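The same ggml_graph_compute_helper() is duplicated into each example. The commit message also mentions ggml_graph_compute_with_ctx(), a backwards-compatible wrapper that draws the work buffer from a context instead; a sketch of equivalent usage, assuming the signature implied by the call sites it replaces:

#include "ggml.h"

void compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * gf) {
    // work memory comes out of ctx, so call sites keep their old
    // one-liner shape at the cost of allocating from the context
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads*/ 1);
}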

examples/benchmark/benchmark-matmult.cpp

Lines changed: 20 additions & 9 deletions
@@ -20,6 +20,17 @@
 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif
 
+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 float tensor_sum_elements(const ggml_tensor * tensor) {
     float sum = 0;
     if (tensor->type==GGML_TYPE_F32) {
@@ -159,13 +170,14 @@ int main(int argc, char ** argv) {
     // printf("Creating compute graph\n");
     struct ggml_cgraph gf = ggml_build_forward(m11xm2);
 
-    gf.n_threads=benchmark_params.n_threads;
-    printf("cgraph->n_threads=%i\n",gf.n_threads);
+    printf("n_threads=%i\n", benchmark_params.n_threads);
 
     TENSOR_DUMP(m11);
     TENSOR_DUMP(m2);
 
-    ggml_graph_compute(ctx, &gf);
+    std::vector<uint8_t> work_buffer;
+
+    ggml_graph_compute_helper(work_buffer, &gf, benchmark_params.n_threads);
 
     TENSOR_DUMP(gf.nodes[0]);
 
@@ -187,7 +199,6 @@ int main(int argc, char ** argv) {
 
     // printf("Creating compute graph\n");
     struct ggml_cgraph gf31 = ggml_build_forward(q31);
-    gf31.n_threads=benchmark_params.n_threads;
 
     // Set up a second graph computation to make sure we override the CPU cache lines
     // printf("Creating new tensor q12 & Running quantize\n");
@@ -199,8 +210,7 @@ int main(int argc, char ** argv) {
 
     //printf("Creating compute graph\n");
     struct ggml_cgraph gf32 = ggml_build_forward(q32);
-    gf32.n_threads=benchmark_params.n_threads;
-    printf("cgraph->n_threads=%i\n",gf31.n_threads);
+    printf("n_threads=%i\n", benchmark_params.n_threads);
 
     const int dimx = sizex;
     const int dimy = sizey;
@@ -221,14 +231,15 @@ int main(int argc, char ** argv) {
 
         long long int start = ggml_time_us();
         //printf("Running ggml_graph_compute\n");
-        ggml_graph_compute(ctx, &gf31);
+        ggml_graph_compute_helper(work_buffer, &gf31, benchmark_params.n_threads);
+
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
         double gflops = (double)(flops_per_matrix)/usec/1000.0;
         gflops_sum += gflops;
         printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%10.2f\n",
             i,
-            gf31.n_threads,
+            benchmark_params.n_threads,
             sizex, sizey, sizez, flops_per_matrix,
             usec,gflops);
 
@@ -253,7 +264,7 @@ int main(int argc, char ** argv) {
         }
 
         // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute(ctx, &gf32);
+        ggml_graph_compute_helper(work_buffer, &gf32, benchmark_params.n_threads);
     }
     printf("\n");
     printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));

examples/metal/metal.cpp

Lines changed: 1 addition & 2 deletions
@@ -35,10 +35,9 @@ int main(int argc, char ** argv) {
     struct ggml_context * ctx_eval = NULL;
 
     struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
-    gf.n_threads = 1;
 
     // this allocates all Metal resources and memory buffers
-    auto * ctx_metal = ggml_metal_init();
+    auto * ctx_metal = ggml_metal_init(1);
 
     const size_t max_size_data = ggml_get_max_tensor_size(ctx_data);
     const size_t max_size_eval = ggml_get_max_tensor_size(ctx_eval);

examples/train-text-from-scratch/train-text-from-scratch.cpp

Lines changed: 18 additions & 9 deletions
@@ -60,6 +60,17 @@ float frand_uniform(struct random_uniform_distribution * rnd) {
     return rnd->rd(rnd->gen);
 }
 
+void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
+    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
+
+    if (plan.work_size > 0) {
+        buf.resize(plan.work_size);
+        plan.work_data = buf.data();
+    }
+
+    ggml_graph_compute(graph, &plan);
+}
+
 struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) {
     float scale = 1.0f; // xavier
     switch (tensor->n_dims) {
@@ -1426,11 +1437,9 @@ struct ggml_tensor * forward_batch_wo_cache_flash_attn_train(
 
     gf->n_nodes = 0;
     gf->n_leafs = 0;
-    gf->work_size = 0;
     gf->perf_runs = 0;
     gf->perf_cycles = 0;
     gf->perf_time_us = 0;
-    gf->work = NULL;
 
     const auto & hparams = model->hparams;
     //const int n_ctx = hparams.n_ctx;
@@ -3162,6 +3171,7 @@ int main(int argc, char ** argv) {
     printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx));
     // ggml_print_tensor_objects(model.ctx);
 
+    // TODO: use std::vector<uint8_t> intead of "new"
     size_t compute_size = 1024ll*1024ll*1024ll*((size_t) params.mem_compute_gb);
     uint8_t * compute_addr = new uint8_t[compute_size];
 
@@ -3183,6 +3193,8 @@ int main(int argc, char ** argv) {
         GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
     }
 
+    std::vector<uint8_t> work_buffer;
+
     printf("%s: begin training\n", __func__);
 
     for (int ex = 0; ex < params.n_examples; ++ex) {
@@ -3217,9 +3229,6 @@ int main(int argc, char ** argv) {
         struct ggml_cgraph * gf = (struct ggml_cgraph *) gfbuf->data;
         struct ggml_cgraph * gb = (struct ggml_cgraph *) gbbuf->data;
 
-        // ggml_cgraph gf = {};
-        gf->n_threads = params.n_threads;
-        gb->n_threads = params.n_threads;
 
         get_example_targets_batch(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, target_logits, target_probs);
 
@@ -3248,7 +3257,7 @@ int main(int argc, char ** argv) {
             *gb = ggml_build_backward(ctx0, gf, true);
         }
 
-        ggml_graph_compute(ctx0, gf);
+        ggml_graph_compute_helper(work_buffer, gf, params.n_threads);
 
         size_t used_mem_before_opt = ggml_used_mem(ctx0);
 
@@ -3272,7 +3281,7 @@ int main(int argc, char ** argv) {
         model.train_samples += n_batch;
         model.train_tokens += n_batch * n_tokens;
 
-        ggml_graph_compute(ctx0, gf);
+        ggml_graph_compute_helper(work_buffer, gf, params.n_threads);
 
         float error_after_opt = ggml_get_f32_1d(loss, 0);
 
@@ -3354,13 +3363,12 @@ int main(int argc, char ** argv) {
         struct ggml_context * ctx0 = ggml_init(cparams);
 
         ggml_cgraph gf = {};
-        gf.n_threads = params.n_threads;
 
         int n_past = 0;
         struct ggml_tensor * logits = forward(&model, &kv_self, ctx0, &gf, tokens_input, sample_ctx, n_past);
 
         ggml_build_forward_expand(&gf, logits);
-        ggml_graph_compute(ctx0, &gf);
+        ggml_graph_compute_helper(work_buffer, &gf, params.n_threads);
 
         //struct ggml_tensor * best_samples = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, sample_ctx);
         //struct ggml_tensor * probs = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_vocab, sample_ctx);
@@ -3386,6 +3394,7 @@ int main(int argc, char ** argv) {
     delete[] compute_addr;
     delete[] compute_buf_0;
     delete[] compute_buf_1;
+
     llama_free(lctx);
     llama_free_model(lmodel);
     ggml_free(model.ctx);
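The two assignments deleted above (gf->work_size, gf->work) are the other half of the API change: scratch memory no longer lives in ggml_cgraph at all, it travels in the plan. A sketch of the fields implied by the helper functions in this commit (names match the calls; any further members are unverified here, see ggml.h for the real definition):

#include <cstddef>
#include <cstdint>

// implied shape of the new plan object
struct ggml_cplan_sketch {
    size_t    work_size; // scratch size, computed by ggml_graph_plan()
    uint8_t * work_data; // caller-provided buffer; required if work_size > 0
    int       n_threads; // thread count the plan was built for
};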

ggml-metal.h

Lines changed: 5 additions & 1 deletion
@@ -34,9 +34,13 @@ extern "C" {
 
 struct ggml_metal_context;
 
-struct ggml_metal_context * ggml_metal_init(void);
+// number of command buffers to use
+struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
 
+// set the number of command buffers to use
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
+
 // creates a mapping between a host memory buffer and a device memory buffer
 // - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
 // - the mapping is used during computation to determine the arguments of the compute kernels
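With n_threads gone from ggml_cgraph, the Metal backend gets its own knob for how many command buffers the graph is encoded into. A minimal usage sketch against the header above (error handling omitted):

#include "ggml-metal.h"

void metal_setup_example(void) {
    // start with a single command buffer, as examples/metal does
    struct ggml_metal_context * ctx_metal = ggml_metal_init(1);

    // ... map buffers and call ggml_metal_graph_compute() here ...

    // the count can be changed later without re-initializing
    ggml_metal_set_n_cb(ctx_metal, 4);

    ggml_metal_free(ctx_metal);
}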

ggml-metal.m

Lines changed: 9 additions & 2 deletions
@@ -25,6 +25,8 @@
 };
 
 struct ggml_metal_context {
+    int n_cb;
+
     float * logits;
 
     id<MTLDevice> device;
@@ -86,11 +88,12 @@ @interface GGMLMetalClass : NSObject
 @implementation GGMLMetalClass
 @end
 
-struct ggml_metal_context * ggml_metal_init(void) {
+struct ggml_metal_context * ggml_metal_init(int n_cb) {
     fprintf(stderr, "%s: allocating\n", __func__);
 
     struct ggml_metal_context * ctx = malloc(sizeof(struct ggml_metal_context));
 
+    ctx->n_cb = n_cb;
     ctx->device = MTLCreateSystemDefaultDevice();
     ctx->queue = [ctx->device newCommandQueue];
     ctx->n_buffers = 0;
@@ -208,6 +211,10 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     free(ctx);
 }
 
+void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
+    ctx->n_cb = n_cb;
+}
+
 // finds the Metal buffer that contains the tensor data on the GPU device
 // the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
 // Metal buffer based on the host memory pointer
@@ -354,7 +361,7 @@ void ggml_metal_graph_compute(
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel
 
-    const int n_cb = gf->n_threads;
+    const int n_cb = ctx->n_cb;
 
     NSMutableArray * command_buffers = [NSMutableArray arrayWithCapacity:n_cb];
 