
Commit a869386

Commit message: grammars
Author: mike dupont
1 parent 50e8ccc

File tree: 3 files changed, +32 -2 lines changed


common/grammar-parser.cpp

Lines changed: 1 addition & 1 deletion
@@ -51,7 +51,7 @@ namespace grammar_parser {
     }
 
     static bool is_word_char(char c) {
-        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
+        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || c == '_' || ('0' <= c && c <= '9');
     }
 
     static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
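
The only functional change here: is_word_char() now accepts '_', so GBNF rule names containing underscores are scanned as a single identifier instead of stopping at the underscore. A minimal, self-contained sketch of the new behavior (the main() driver and the sample name are only for illustration, not part of the commit):

    #include <cstdio>

    // Patched predicate from common/grammar-parser.cpp: '_' now counts as a word character.
    static bool is_word_char(char c) {
        return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || c == '_' || ('0' <= c && c <= '9');
    }

    int main() {
        // Before the patch, scanning the rule name "my_rule" stopped at '_' and only
        // consumed "my"; with the patch every character below is a word character.
        const char * name = "my_rule";
        for (const char * p = name; *p; ++p) {
            printf("'%c' -> %s\n", *p, is_word_char(*p) ? "word char" : "delimiter");
        }
        return 0;
    }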

grammars/cublas.gebnf

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

llama.cpp

Lines changed: 30 additions & 0 deletions
@@ -1494,6 +1494,7 @@ static bool llama_kv_cache_init(
                          ggml_type   wtype,
                           uint32_t   n_ctx,
                                int   n_gpu_layers) {
+    fprintf(stderr, "GPULAYERS '%d'\n", n_gpu_layers);
     const uint32_t n_embd  = hparams.n_embd_gqa();
     const uint32_t n_layer = hparams.n_layer;
 
@@ -1531,6 +1532,7 @@ static bool llama_kv_cache_init(
     (void) n_gpu_layers;
 
 #ifdef GGML_USE_CUBLAS
+    fprintf(stderr, "USE CUBLAS\n");
     if (ggml_cublas_loaded()) {
         size_t vram_kv_cache = 0;
 
@@ -1548,6 +1550,8 @@ static bool llama_kv_cache_init(
             LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MiB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
         }
     }
+#else
+    fprintf(stderr, "NO USE CUBLAS\n");
 #endif
 
     return true;
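
These three kv-cache hunks only add tracing: the new fprintf calls report the requested n_gpu_layers and whether the binary was built with GGML_USE_CUBLAS. A minimal standalone sketch of the same compile-time check (everything outside the #ifdef is scaffolding for illustration):

    #include <cstdio>

    int main() {
        // Mirrors the pattern added in llama_kv_cache_init: the branch is resolved
        // at compile time, so the message reports how the binary was built.
    #ifdef GGML_USE_CUBLAS
        fprintf(stderr, "USE CUBLAS\n");     // built with cuBLAS offload support
    #else
        fprintf(stderr, "NO USE CUBLAS\n");  // built without cuBLAS
    #endif
        return 0;
    }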
@@ -2065,6 +2069,7 @@ struct llama_model_loader {
                     break;
 #ifdef GGML_USE_CUBLAS
                 case GGML_BACKEND_GPU:
+
                 case GGML_BACKEND_GPU_SPLIT:
                     // old code:
                     //ggml_cuda_transform_tensor(lt.data, lt.ggml_tensor);
@@ -2741,9 +2746,11 @@ static void llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
                     if (backend_norm == GGML_BACKEND_GPU) {
+                        fprintf(stderr, "vram_weights00 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output_norm);
                     }
                     if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        fprintf(stderr, "vram_weights01 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output);
                     }
                 }
@@ -2774,6 +2781,7 @@ static void llm_load_tensors(
                     layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
 
                     if (backend == GGML_BACKEND_GPU) {
+                        fprintf(stderr, "vram_weights03 '%ld'\n", vram_weights);
                         vram_weights +=
                             ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                             ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
@@ -2807,9 +2815,11 @@ static void llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
                     if (backend_norm == GGML_BACKEND_GPU) {
+                        fprintf(stderr, "vram_weights04 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output_norm);
                     }
                     if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        fprintf(stderr, "vram_weights05 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output);
                     }
                 }
@@ -2840,6 +2850,7 @@ static void llm_load_tensors(
                     layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
 
                     if (backend == GGML_BACKEND_GPU) {
+                        fprintf(stderr, "vram_weights06 '%ld'\n", vram_weights);
                         vram_weights +=
                             ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
                             ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
@@ -2878,10 +2889,13 @@ static void llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
                     if (backend_norm == GGML_BACKEND_GPU) {
+                        fprintf(stderr, "vram_weights07 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output_norm);
+                        fprintf(stderr, "vram_weights08 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output_norm_b);
                     }
                     if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        fprintf(stderr, "vram_weights09 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output);
                     }
                 }
@@ -2906,7 +2920,9 @@ static void llm_load_tensors(
                         layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
 
                         if (backend == GGML_BACKEND_GPU) {
+                            fprintf(stderr, "vram_weights10 '%ld'\n", vram_weights);
                             vram_weights += ggml_nbytes(layer.attn_norm_2);
+                            fprintf(stderr, "vram_weights11 '%ld'\n", vram_weights);
                             vram_weights += ggml_nbytes(layer.attn_norm_2_b);
                         }
                     }
@@ -2918,6 +2934,7 @@ static void llm_load_tensors(
                     layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
 
                     if (backend == GGML_BACKEND_GPU) {
+                        fprintf(stderr, "vram_weights12 '%ld'\n", vram_weights);
                         vram_weights +=
                             ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
                             ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.wo) +
@@ -2955,10 +2972,12 @@ static void llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
                     if (backend_norm == GGML_BACKEND_GPU) {
+                        fprintf(stderr, "vram_weights13 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output_norm);
                         vram_weights += ggml_nbytes(model.output_norm_b);
                     }
                     if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        fprintf(stderr, "vram_weights14 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output);
                     }
                 }
@@ -2994,6 +3013,7 @@ static void llm_load_tensors(
                     layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
 
                     if (backend == GGML_BACKEND_GPU) {
+                        fprintf(stderr, "vram_weights15 '%ld'\n", vram_weights);
                         vram_weights +=
                             ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
                             ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv) +
@@ -3039,10 +3059,13 @@ static void llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
                     if (backend_norm == GGML_BACKEND_GPU) {
+                        fprintf(stderr, "vram_weights16 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output_norm);
+                        fprintf(stderr, "vram_weights17 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output_norm_b);
                     }
                     if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        fprintf(stderr, "vram_weights18 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output);
                     }
                 }
@@ -3105,10 +3128,13 @@ static void llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
                     if (backend_norm == GGML_BACKEND_GPU) {
+                        fprintf(stderr, "vram_weights19 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output_norm);
+                        fprintf(stderr, "vram_weights20 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output_norm_b);
                     }
                     if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        fprintf(stderr, "vram_weights21 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output);
                     }
                 }
@@ -3144,6 +3170,7 @@ static void llm_load_tensors(
                     layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
 
                     if (backend == GGML_BACKEND_GPU) {
+                        fprintf(stderr, "vram_weights22 '%ld'\n", vram_weights);
                         vram_weights +=
                             ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.attn_norm_b) +
                             ggml_nbytes(layer.wqkv)      + ggml_nbytes(layer.bqkv) +
@@ -3182,9 +3209,11 @@ static void llm_load_tensors(
                     model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
 
                     if (backend_norm == GGML_BACKEND_GPU) {
+                        fprintf(stderr, "vram_weights23 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output_norm);
                     }
                     if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        fprintf(stderr, "vram_weights24 '%ld'\n", vram_weights);
                         vram_weights += ggml_nbytes(model.output);
                     }
                 }
@@ -3211,6 +3240,7 @@ static void llm_load_tensors(
                     layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
 
                     if (backend == GGML_BACKEND_GPU) {
+                        fprintf(stderr, "vram_weights25 '%ld'\n", vram_weights);
                         vram_weights +=
                             ggml_nbytes(layer.attn_norm) +
                             ggml_nbytes(layer.wqkv) +
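
All of the llm_load_tensors additions are the same trace: print the running vram_weights total just before each increment, with a numeric tag identifying the site. As a side note on the pattern (not part of the commit), the repetition could be collapsed into one helper macro; a hedged sketch, with VRAM_TRACE as a hypothetical name and a local size_t standing in for the real accumulator (the value is cast to long so the commit's '%ld' format stays well-defined):

    #include <cstdio>
    #include <cstddef>

    // Hypothetical helper, not in this commit: log the running total with a tag,
    // then apply the increment, so each call site stays on one line.
    #define VRAM_TRACE(tag, expr)                                                \
        do {                                                                     \
            fprintf(stderr, "vram_weights%s '%ld'\n", tag, (long) vram_weights); \
            vram_weights += (expr);                                              \
        } while (0)

    int main() {
        size_t vram_weights = 0;    // stands in for the accumulator in llm_load_tensors
        VRAM_TRACE("07", 1024);     // stands in for ggml_nbytes(model.output_norm)
        VRAM_TRACE("08", 2048);     // stands in for ggml_nbytes(model.output_norm_b)
        fprintf(stderr, "total: %ld\n", (long) vram_weights);
        return 0;
    }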
