Skip to content

Commit 5337240

Browse files
committed
Merge remote-tracking branch 'upstream/master'
2 parents 5cc49e6 + 41aee4d commit 5337240

File tree

8 files changed

+165
-36
lines changed

8 files changed

+165
-36
lines changed

CMakeLists.txt

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -82,6 +82,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
8282
option(LLAMA_CUBLAS "llama: use CUDA" OFF)
8383
#option(LLAMA_CUDA_CUBLAS "llama: use cuBLAS for prompt processing" OFF)
8484
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
85+
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
8586
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
8687
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
8788
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
@@ -305,6 +306,9 @@ if (LLAMA_CUBLAS)
305306
if (LLAMA_CUDA_FORCE_DMMV)
306307
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
307308
endif()
309+
if (LLAMA_CUDA_FORCE_MMQ)
310+
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
311+
endif()
308312
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
309313
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
310314
if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -405,6 +409,9 @@ if (LLAMA_HIPBLAS)
405409
if (LLAMA_CUDA_FORCE_DMMV)
406410
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_DMMV)
407411
endif()
412+
if (LLAMA_CUDA_FORCE_MMQ)
413+
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_FORCE_MMQ)
414+
endif()
408415
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
409416
target_compile_definitions(ggml-rocm PRIVATE GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
410417
target_compile_definitions(ggml-rocm PRIVATE K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

Makefile

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -397,6 +397,9 @@ endif # CUDA_DOCKER_ARCH
397397
ifdef LLAMA_CUDA_FORCE_DMMV
398398
NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
399399
endif # LLAMA_CUDA_FORCE_DMMV
400+
ifdef LLAMA_CUDA_FORCE_MMQ
401+
NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
402+
endif # LLAMA_CUDA_FORCE_MMQ
400403
ifdef LLAMA_CUDA_DMMV_X
401404
NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(LLAMA_CUDA_DMMV_X)
402405
else

examples/server/server.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1502,7 +1502,7 @@ struct llama_server_context
15021502
{
15031503
for (auto & slot : slots)
15041504
{
1505-
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
1505+
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty()) || !slot.images.empty();
15061506

15071507
// empty prompt passed -> release the slot and send empty response
15081508
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)

examples/simple/simple.cpp

Lines changed: 4 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -95,13 +95,8 @@ int main(int argc, char ** argv) {
9595
llama_batch batch = llama_batch_init(512, 0, 1);
9696

9797
// evaluate the initial prompt
98-
batch.n_tokens = tokens_list.size();
99-
100-
for (int32_t i = 0; i < batch.n_tokens; i++) {
101-
batch.token[i] = tokens_list[i];
102-
batch.pos[i] = i;
103-
batch.seq_id[i] = 0;
104-
batch.logits[i] = false;
98+
for (size_t i = 0; i < tokens_list.size(); i++) {
99+
llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
105100
}
106101

107102
// llama_decode will output logits only for the last token of the prompt
@@ -148,15 +143,10 @@ int main(int argc, char ** argv) {
148143
fflush(stdout);
149144

150145
// prepare the next batch
151-
batch.n_tokens = 0;
146+
llama_batch_clear(batch);
152147

153148
// push this new token for next evaluation
154-
batch.token [batch.n_tokens] = new_token_id;
155-
batch.pos [batch.n_tokens] = n_cur;
156-
batch.seq_id[batch.n_tokens] = 0;
157-
batch.logits[batch.n_tokens] = true;
158-
159-
batch.n_tokens += 1;
149+
llama_batch_add(batch, new_token_id, n_cur, { 0 }, true);
160150

161151
n_decode += 1;
162152
}

examples/speculative/speculative.cpp

Lines changed: 32 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,9 @@
88
#include <string>
99
#include <vector>
1010

11+
#define SPEC_VOCAB_MAX_SIZE_DIFFERENCE 100
12+
#define SPEC_VOCAB_CHECK_START_TOKEN_ID 5
13+
1114
struct seq_draft {
1215
bool active = false;
1316
bool drafting = false;
@@ -64,6 +67,33 @@ int main(int argc, char ** argv) {
6467
params.n_gpu_layers = params.n_gpu_layers_draft;
6568
std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);
6669

70+
{
71+
const int n_vocab_tgt = llama_n_vocab(model_tgt);
72+
const int n_vocab_dft = llama_n_vocab(model_dft);
73+
const int vocab_diff = n_vocab_tgt > n_vocab_dft
74+
? n_vocab_tgt - n_vocab_dft
75+
: n_vocab_dft - n_vocab_tgt;
76+
77+
if (vocab_diff > SPEC_VOCAB_MAX_SIZE_DIFFERENCE) {
78+
fprintf(stderr, "%s: error: draft model vocab must closely match target model to use speculation but ", __func__);
79+
fprintf(stderr, "target vocab size %d does not match draft vocab size %d - difference %d, max allowed %d\n",
80+
n_vocab_tgt, llama_n_vocab(model_dft), vocab_diff, SPEC_VOCAB_MAX_SIZE_DIFFERENCE);
81+
return 1;
82+
}
83+
84+
for (int i = SPEC_VOCAB_CHECK_START_TOKEN_ID; i < std::min(n_vocab_tgt, n_vocab_dft); ++i) {
85+
const char * token_text_tgt = llama_token_get_text(model_tgt, i);
86+
const char * token_text_dft = llama_token_get_text(model_dft, i);
87+
if (std::strcmp(token_text_tgt, token_text_dft) != 0) {
88+
fprintf(stderr, "%s: error: draft model vocab must match target model to use speculation but ", __func__);
89+
fprintf(stderr, "token %d content differs - target '%s', draft '%s'\n", i,
90+
llama_token_to_piece(ctx_tgt, i).c_str(),
91+
llama_token_to_piece(ctx_dft, i).c_str());
92+
return 1;
93+
}
94+
}
95+
}
96+
6797
// tokenize the prompt
6898
std::vector<llama_token> inp;
6999
inp = ::llama_tokenize(ctx_tgt, params.prompt, true);
@@ -227,6 +257,7 @@ int main(int argc, char ** argv) {
227257
llama_batch_add (batch_dft, id, n_past_dft, { 0 }, true);
228258

229259
llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
260+
// LOG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
230261
llama_decode (ctx_dft, batch_dft);
231262

232263
++n_past_dft;
@@ -370,7 +401,7 @@ int main(int argc, char ** argv) {
370401
llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
371402
}
372403

373-
//LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt));
404+
// LOG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
374405
llama_decode(ctx_tgt, batch_tgt);
375406
++n_past_tgt;
376407
}

0 commit comments

Comments (0)