
Commit b804b1e

phymbert, slaren, and ggerganov authored

eval-callback: Example how to use eval callback for debugging (#6576)

* gguf-debug: Example how to use ggml callback for debugging
* gguf-debug: no mutex, verify type, fix stride.
* llama: cv eval: move cb eval field in common gpt_params
* ggml_debug: use common gpt_params to pass cb eval. Fix get tensor SIGV random.
* ggml_debug: ci: add tests
* ggml_debug: EOL in CMakeLists.txt
* ggml_debug: Remove unused param n_batch, no batching here
* ggml_debug: fix trailing spaces
* ggml_debug: fix trailing spaces
* common: fix cb_eval and user data not initialized
* ci: build revert label
* ggml_debug: add main test label
* doc: add a model: add a link to ggml-debug
* ggml-debug: add to make toolchain
* ggml-debug: tests add the main label
* ggml-debug: ci add test curl label
* common: allow the warmup to be disabled in llama_init_from_gpt_params
* ci: add curl test
* ggml-debug: better tensor type support
* gitignore : ggml-debug
* ggml-debug: printing also the sum of each tensor
* ggml-debug: remove block size
* eval-callback: renamed from ggml-debug
* eval-callback: fix make toolchain

---------

Co-authored-by: slaren <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 8228b66 commit b804b1e

12 files changed: +320 additions, −23 deletions

.github/workflows/build.yml

Lines changed: 4 additions & 4 deletions
```diff
@@ -52,7 +52,7 @@ jobs:
         id: cmake_test
         run: |
           cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L 'main|curl' --verbose --timeout 900
 
       - name: Determine tag name
         id: tag
@@ -209,21 +209,21 @@ jobs:
         id: depends
         run: |
           sudo apt-get update
-          sudo apt-get install build-essential
+          sudo apt-get install build-essential libcurl4-openssl-dev
 
       - name: Build
         id: cmake_build
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON
+          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON
           cmake --build . --config Release -j $(nproc)
 
       - name: Test
         id: cmake_test
         run: |
           cd build
-          ctest -L main --verbose --timeout 900
+          ctest -L 'main|curl' --verbose --timeout 900
 
       - name: Test llama2c conversion
         id: llama2c_test
```

.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -48,6 +48,7 @@ models-mnt
 /convert-llama2c-to-ggml
 /embd-input-test
 /embedding
+/eval-callback
 /gguf
 /gguf-llama-simple
 /gguf-split
```

Makefile

Lines changed: 5 additions & 1 deletion
```diff
@@ -1,7 +1,7 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
-	simple batched batched-bench save-load-state server gguf gguf-split llama-bench libllava.a llava-cli baby-llama beam-search \
+	simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search \
 	retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o
 
 # Binaries only useful for tests
@@ -800,6 +800,10 @@ gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(O
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
 
+eval-callback: examples/eval-callback/eval-callback.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
```

common/common.cpp

Lines changed: 3 additions & 1 deletion
```diff
@@ -1745,6 +1745,8 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.yarn_orig_ctx     = params.yarn_orig_ctx;
     cparams.pooling_type      = params.pooling_type;
     cparams.defrag_thold      = params.defrag_thold;
+    cparams.cb_eval           = params.cb_eval;
+    cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv       = !params.no_kv_offload;
 
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
@@ -2192,7 +2194,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY;
     }
 
-    {
+    if (params.warmup) {
         LOG("warming up the model with an empty run\n");
 
         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
```
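The second hunk makes the warmup decode conditional on the new `params.warmup` flag. A minimal sketch (not part of this commit) of how a program built against the common library could skip the warmup run:

```cpp
// Hypothetical caller: disable the warmup decode via the new gpt_params flag.
#include "common.h"
#include "llama.h"
#include <tuple>

int main() {
    gpt_params params;
    params.model  = "phi-2-q4_0.gguf"; // placeholder path to any local GGUF model
    params.warmup = false;             // added in this commit; defaults to true

    llama_backend_init();
    llama_model *   model = nullptr;
    llama_context * ctx   = nullptr;
    std::tie(model, ctx)  = llama_init_from_gpt_params(params); // no warmup decode now
    // ... tokenize and llama_decode() as usual ...
    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```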

common/common.h

Lines changed: 4 additions & 0 deletions
```diff
@@ -80,6 +80,9 @@ struct gpt_params {
     int32_t yarn_orig_ctx = 0;     // YaRN original context length
     float   defrag_thold  = -1.0f; // KV cache defragmentation threshold
 
+    ggml_backend_sched_eval_callback cb_eval = nullptr;
+    void * cb_eval_user_data                 = nullptr;
+
     ggml_numa_strategy numa = GGML_NUMA_STRATEGY_DISABLED;
 
     llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
@@ -156,6 +159,7 @@ struct gpt_params {
     bool infill           = false; // use infill mode
     bool dump_kv_cache    = false; // dump the KV cache contents for debugging purposes
     bool no_kv_offload    = false; // disable KV offloading
+    bool warmup           = true;  // warmup run
 
     std::string cache_type_k = "f16"; // KV cache data type for the K
     std::string cache_type_v = "f16"; // KV cache data type for the V
```
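These two new fields let any example pass a scheduler eval callback down to `llama_context_params` (see the `common/common.cpp` hunk above). A rough sketch of such a callback, assuming the `ggml_backend_sched_eval_callback` type from `ggml-backend.h`; this is illustrative, not the callback shipped in `examples/eval-callback`:

```cpp
#include "common.h"
#include "ggml.h"
#include <cstdio>

// The scheduler calls this twice per graph node:
//   ask == true  -> return whether this node should be observed at all
//   ask == false -> the node has been computed and can be inspected
static bool my_eval_cb(struct ggml_tensor * t, bool ask, void * user_data) {
    if (ask) {
        return true; // observe every node
    }
    std::fprintf(stderr, "computed %s = %s(...)\n", t->name, ggml_op_name(t->op));
    return true; // returning false stops evaluation of the remaining nodes
}

// Wiring it through the new fields (user_data is passed back verbatim):
//   gpt_params params;
//   params.cb_eval           = my_eval_cb;
//   params.cb_eval_user_data = nullptr;
//   std::tie(model, ctx)     = llama_init_from_gpt_params(params);
```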

docs/HOWTO-add-model.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -100,6 +100,8 @@ Have a look to existing implementation like `build_llama`, `build_dbrx` or `buil
 
 When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR.
 
+Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
+
 ## GGUF specification
 
 https://github.com/ggerganov/ggml/blob/master/docs/gguf.md
```

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -19,6 +19,7 @@ else()
     add_subdirectory(benchmark)
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
+    add_subdirectory(eval-callback)
    add_subdirectory(finetune)
    add_subdirectory(gritlm)
    add_subdirectory(gguf-split)
```

examples/eval-callback/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
```diff
@@ -0,0 +1,9 @@
+set(TARGET eval-callback)
+add_executable(${TARGET} eval-callback.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
+
+set(TEST_TARGET test-eval-callback)
+add_test(NAME ${TEST_TARGET} COMMAND eval-callback --hf-repo ggml-org/models --hf-file tinyllamas/stories260K.gguf --model stories260K.gguf --prompt hello --seed 42)
+set_property(TEST ${TEST_TARGET} PROPERTY LABELS eval-callback curl)
```

examples/eval-callback/README.md

Lines changed: 95 additions & 0 deletions
````diff
@@ -0,0 +1,95 @@
+# llama.cpp/examples/eval-callback
+
+A simple example which demonstrates how to use callback during the inference.
+It simply prints to the console all operations and tensor data.
+
+Usage:
+
+```shell
+eval-callback \
+  --hf-repo ggml-org/models \
+  --hf-file phi-2/ggml-model-q4_0.gguf \
+  --model phi-2-q4_0.gguf \
+  --prompt hello \
+  --seed 42 \
+  -ngl 33
+```
+
+Will print:
+
+```shell
+llm_load_tensors: offloaded 33/33 layers to GPU
+...
+llama_new_context_with_model: n_ctx = 512
+...
+llama_new_context_with_model: CUDA0 compute buffer size = 105.00 MiB
+llama_new_context_with_model: CUDA_Host compute buffer size = 6.01 MiB
+llama_new_context_with_model: graph nodes = 1225
+llama_new_context_with_model: graph splits = 2
+ggml_debug: inp_embd = (f32) GET_ROWS(token_embd.weight{2560, 51200, 1, 1}, inp_tokens{1, 1, 1, 1}}) = {2560, 1, 1, 1}
+[
+ [
+  [ -0.0181, 0.0272, 0.0272, ...],
+ ],
+]
+ggml_debug: norm-0 = (f32) NORM(CUDA0#inp_embd#0{2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+[
+ [
+  [ -0.6989, 1.0636, 1.0636, ...],
+ ],
+]
+ggml_debug: norm_w-0 = (f32) MUL(norm-0{2560, 1, 1, 1}, blk.0.attn_norm.weight{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+[
+ [
+  [ -0.1800, 0.2817, 0.2632, ...],
+ ],
+]
+ggml_debug: attn_norm-0 = (f32) ADD(norm_w-0{2560, 1, 1, 1}, blk.0.attn_norm.bias{2560, 1, 1, 1}}) = {2560, 1, 1, 1}
+[
+ [
+  [ -0.1863, 0.2970, 0.2604, ...],
+ ],
+]
+ggml_debug: wqkv-0 = (f32) MUL_MAT(blk.0.attn_qkv.weight{2560, 7680, 1, 1}, attn_norm-0{2560, 1, 1, 1}}) = {7680, 1, 1, 1}
+[
+ [
+  [ -1.1238, 1.2876, -1.8086, ...],
+ ],
+]
+ggml_debug: bqkv-0 = (f32) ADD(wqkv-0{7680, 1, 1, 1}, blk.0.attn_qkv.bias{7680, 1, 1, 1}}) = {7680, 1, 1, 1}
+[
+ [
+  [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+]
+ggml_debug: bqkv-0 (view) = (f32) VIEW(bqkv-0{7680, 1, 1, 1}, }) = {2560, 1, 1, 1}
+[
+ [
+  [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+]
+ggml_debug: Qcur-0 = (f32) CONT(bqkv-0 (view){2560, 1, 1, 1}, }) = {2560, 1, 1, 1}
+[
+ [
+  [ -1.1135, 1.4604, -1.9226, ...],
+ ],
+]
+ggml_debug: Qcur-0 (reshaped) = (f32) RESHAPE(Qcur-0{2560, 1, 1, 1}, }) = {80, 32, 1, 1}
+[
+ [
+  [ -1.1135, 1.4604, -1.9226, ...],
+  [ -0.3608, 0.5076, -1.8866, ...],
+  [ 1.7643, 0.0273, -2.1065, ...],
+  ...
+ ],
+]
+ggml_debug: Qcur-0 = (f32) ROPE(Qcur-0 (reshaped){80, 32, 1, 1}, CUDA0#inp_pos#0{1, 1, 1, 1}}) = {80, 32, 1, 1}
+[
+ [
+  [ -1.1135, 1.4604, -1.9226, ...],
+  [ -0.3608, 0.5076, -1.8866, ...],
+  [ 1.7643, 0.0273, -2.1065, ...],
+  ...
+ ],
+]
+```
````
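For orientation, a hedged sketch of the kind of callback that produces the `ggml_debug:` lines above; the actual `examples/eval-callback/eval-callback.cpp` may differ in formatting and in how it handles quantized types. It assumes the `ggml_backend_sched_eval_callback` interface and copies tensor data to the host with `ggml_backend_tensor_get`, since the tensor may live on a GPU backend:

```cpp
#include "ggml.h"
#include "ggml-backend.h"
#include <cstdio>
#include <cstdint>
#include <vector>

static bool ggml_debug(struct ggml_tensor * t, bool ask, void * user_data) {
    if (ask) {
        return true; // observe every node
    }
    // The first source tensor is the main operand shown in parentheses in the log.
    const struct ggml_tensor * src0 = t->src[0];
    std::printf("ggml_debug: %s = (%s) %s(%s) = {%ld, %ld, %ld, %ld}\n",
                t->name, ggml_type_name(t->type), ggml_op_name(t->op),
                src0 ? src0->name : "",
                (long) t->ne[0], (long) t->ne[1], (long) t->ne[2], (long) t->ne[3]);

    // Copy the data off the backend (GPU or CPU) before reading it on the host.
    if (t->type == GGML_TYPE_F32 && ggml_nelements(t) >= 3) {
        std::vector<uint8_t> buf(ggml_nbytes(t));
        ggml_backend_tensor_get(t, buf.data(), 0, buf.size());
        const float * data = (const float *) buf.data();
        std::printf("  [ %8.4f, %8.4f, %8.4f, ...]\n", data[0], data[1], data[2]);
    }
    return true; // keep evaluating the graph
}
```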
