
Commit 6075b27

Merge branch 'master' into gg/ggml-common-decl

2 parents f33ab14 + 332bdfd

15 files changed: +1646 -454 lines

CMakeLists.txt

Lines changed: 14 additions & 2 deletions

@@ -116,6 +116,7 @@ option(LLAMA_MPI "llama: use MPI"
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
+set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
 option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})

@@ -207,7 +208,7 @@ if (LLAMA_METAL)
         enable_language(ASM)
         add_compile_definitions(GGML_METAL_EMBED_LIBRARY)

-        set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
+        set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
         file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
         set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")

@@ -534,6 +535,10 @@ if (LLAMA_HIPBLAS)
     endif()

 if (LLAMA_SYCL)
+    if (NOT LLAMA_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
+        message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
+    endif()
+
     if ( NOT DEFINED ENV{ONEAPI_ROOT})
         message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
     endif()

@@ -555,14 +560,21 @@ if (LLAMA_SYCL)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
+    if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
+    endif()

     set(GGML_HEADERS_SYCL ggml-sycl.h)
     set(GGML_SOURCES_SYCL ggml-sycl.cpp)

     if (WIN32)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
     else()
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+        if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+        elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl pthread m dl onemkl)
+        endif()
     endif()
 endif()
README-sycl.md

Lines changed: 26 additions & 0 deletions

@@ -73,6 +73,29 @@ For iGPU, please make sure the shared memory from host memory is enough. For lla

 For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.

+## Nvidia GPU
+
+### Verified
+
+|Nvidia GPU| Status | Verified Model|
+|-|-|-|
+|Ampere Series| Support| A100|
+
+### oneMKL
+
+The current oneMKL release does not contain the oneMKL cuBLAS backend.
+As a result, for Nvidia GPUs, oneMKL must be built from source.
+
+```
+git clone https://github.com/oneapi-src/oneMKL
+cd oneMKL
+mkdir build
+cd build
+cmake -G Ninja .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON
+ninja
+# Add paths as necessary
+```
+
 ## Docker

 Note:

@@ -186,6 +209,9 @@ source /opt/intel/oneapi/setvars.sh
 # Or, for FP32:
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

+# For Nvidia GPUs
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
 # Build example/main only
 #cmake --build . --config Release --target main
README.md

Lines changed: 2 additions & 5 deletions

@@ -8,11 +8,6 @@

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

-> [!IMPORTANT]
-> **Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962**
->
-> Vote for which quantization type provides better responses, all other parameters being the same.
-
 ### Recent API changes

 - [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328

@@ -21,6 +16,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

+- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
+- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328

 ----

examples/llama.android/app/src/main/cpp/llama-android.cpp

Lines changed: 51 additions & 2 deletions

@@ -33,6 +33,45 @@ jclass la_int_var;
 jmethodID la_int_var_value;
 jmethodID la_int_var_inc;

+std::string cached_token_chars;
+
+bool is_valid_utf8(const char * string) {
+    if (!string) {
+        return true;
+    }
+
+    const unsigned char * bytes = (const unsigned char *)string;
+    int num;
+
+    while (*bytes != 0x00) {
+        if ((*bytes & 0x80) == 0x00) {
+            // U+0000 to U+007F
+            num = 1;
+        } else if ((*bytes & 0xE0) == 0xC0) {
+            // U+0080 to U+07FF
+            num = 2;
+        } else if ((*bytes & 0xF0) == 0xE0) {
+            // U+0800 to U+FFFF
+            num = 3;
+        } else if ((*bytes & 0xF8) == 0xF0) {
+            // U+10000 to U+10FFFF
+            num = 4;
+        } else {
+            return false;
+        }
+
+        bytes += 1;
+        for (int i = 1; i < num; ++i) {
+            if ((*bytes & 0xC0) != 0x80) {
+                return false;
+            }
+            bytes += 1;
+        }
+    }
+
+    return true;
+}
+
 static void log_callback(ggml_log_level level, const char * fmt, void * data) {
     if (level == GGML_LOG_LEVEL_ERROR) __android_log_print(ANDROID_LOG_ERROR, TAG, fmt, data);
     else if (level == GGML_LOG_LEVEL_INFO) __android_log_print(ANDROID_LOG_INFO, TAG, fmt, data);

@@ -295,6 +334,8 @@ Java_com_example_llama_Llm_completion_1init(
     jint n_len
 ) {

+    cached_token_chars.clear();
+
     const auto text = env->GetStringUTFChars(jtext, 0);
     const auto context = reinterpret_cast<llama_context *>(context_pointer);
     const auto batch = reinterpret_cast<llama_batch *>(batch_pointer);

@@ -372,8 +413,16 @@ Java_com_example_llama_Llm_completion_1loop(
     }

     auto new_token_chars = llama_token_to_piece(context, new_token_id);
-    LOGi("new_token_chars: `%s`", new_token_chars.c_str());
-    auto new_token = env->NewStringUTF(new_token_chars.c_str());
+    cached_token_chars += new_token_chars;
+
+    jstring new_token = nullptr;
+    if (is_valid_utf8(cached_token_chars.c_str())) {
+        new_token = env->NewStringUTF(cached_token_chars.c_str());
+        LOGi("cached: %s, new_token_chars: `%s`, id: %d", cached_token_chars.c_str(), new_token_chars.c_str(), new_token_id);
+        cached_token_chars.clear();
+    } else {
+        new_token = env->NewStringUTF("");
+    }

     llama_batch_clear(*batch);
     llama_batch_add(*batch, new_token_id, n_cur, { 0 }, true);
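
The change above buffers token pieces in `cached_token_chars` and only hands a string to `NewStringUTF` once it is valid UTF-8, so a multi-byte character split across two tokens never reaches the Java side half-finished. Below is a minimal standalone sketch of that buffering pattern, not the JNI code itself; the validator is a condensed rewrite of the `is_valid_utf8` added in the diff, and the token pieces in `main` are made-up test data.

```cpp
#include <cstdio>
#include <string>
#include <vector>

// Condensed version of the validator from the diff: every UTF-8 lead byte must be
// followed by the right number of continuation bytes before the string ends.
static bool is_valid_utf8(const char * s) {
    const unsigned char * bytes = (const unsigned char *) s;
    while (*bytes != 0x00) {
        int num;
        if      ((*bytes & 0x80) == 0x00) num = 1; // U+0000  to U+007F
        else if ((*bytes & 0xE0) == 0xC0) num = 2; // U+0080  to U+07FF
        else if ((*bytes & 0xF0) == 0xE0) num = 3; // U+0800  to U+FFFF
        else if ((*bytes & 0xF8) == 0xF0) num = 4; // U+10000 to U+10FFFF
        else return false;
        bytes += 1;
        for (int i = 1; i < num; ++i) {
            if ((*bytes & 0xC0) != 0x80) return false; // missing continuation byte
            bytes += 1;
        }
    }
    return true;
}

int main() {
    // Pretend the tokenizer split the Euro sign (E2 82 AC) across two token pieces.
    std::vector<std::string> pieces = { "ab\xE2\x82", "\xAC" "cd" };

    std::string cached_token_chars;
    for (const auto & piece : pieces) {
        cached_token_chars += piece;
        if (is_valid_utf8(cached_token_chars.c_str())) {
            // Only complete characters are emitted (NewStringUTF in the real code).
            printf("emit: %s\n", cached_token_chars.c_str());
            cached_token_chars.clear();
        } else {
            // Incomplete sequence: keep buffering and emit nothing this round.
            printf("hold: incomplete UTF-8, waiting for the next token\n");
        }
    }
    return 0;
}
```

Run standalone, the first piece is held because the trailing byte of the character has not arrived yet, and the combined string is emitted on the next iteration.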

examples/llama.android/app/src/main/java/com/example/llama/Llm.kt

Lines changed: 2 additions & 2 deletions

@@ -71,7 +71,7 @@ class Llm {
         batch: Long,
         nLen: Int,
         ncur: IntVar
-    ): String
+    ): String?

     private external fun kv_cache_clear(context: Long)

@@ -115,7 +115,7 @@ class Llm {
         val ncur = IntVar(completion_init(state.context, state.batch, message, nlen))
         while (ncur.value <= nlen) {
             val str = completion_loop(state.context, state.batch, nlen, ncur)
-            if (str.isEmpty()) {
+            if (str == null) {
                 break
             }
             emit(str)

examples/server/server.cpp

Lines changed: 4 additions & 3 deletions

@@ -3195,11 +3195,12 @@ int main(int argc, char ** argv) {
        ctx_server.queue_results.add_waiting_task_id(id_task);
        ctx_server.request_completion(id_task, -1, data, false, false);

+       const auto completion_id = gen_chatcmplid();
        if (!json_value(data, "stream", false)) {
            server_task_result result = ctx_server.queue_results.recv(id_task);

            if (!result.error && result.stop) {
-               json result_oai = format_final_response_oaicompat(data, result.data);
+               json result_oai = format_final_response_oaicompat(data, result.data, completion_id);

                res.set_content(result_oai.dump(-1, ' ', false, json::error_handler_t::replace), "application/json; charset=utf-8");
            } else {

@@ -3208,11 +3209,11 @@ int main(int argc, char ** argv) {
                }
            }
            ctx_server.queue_results.remove_waiting_task_id(id_task);
        } else {
-           const auto chunked_content_provider = [id_task, &ctx_server](size_t, httplib::DataSink & sink) {
+           const auto chunked_content_provider = [id_task, &ctx_server, completion_id](size_t, httplib::DataSink & sink) {
                while (true) {
                    server_task_result result = ctx_server.queue_results.recv(id_task);
                    if (!result.error) {
-                       std::vector<json> result_array = format_partial_response_oaicompat(result.data);
+                       std::vector<json> result_array = format_partial_response_oaicompat(result.data, completion_id);

                        for (auto it = result_array.begin(); it != result_array.end(); ++it) {
                            if (!it->empty()) {

examples/server/utils.hpp

Lines changed: 6 additions & 6 deletions

@@ -378,7 +378,7 @@ static json oaicompat_completion_params_parse(
     return llama_params;
 }

-static json format_final_response_oaicompat(const json & request, json result, bool streaming = false) {
+static json format_final_response_oaicompat(const json & request, json result, const std::string & completion_id, bool streaming = false) {
     bool stopped_word = result.count("stopped_word") != 0;
     bool stopped_eos = json_value(result, "stopped_eos", false);
     int num_tokens_predicted = json_value(result, "tokens_predicted", 0);

@@ -412,7 +412,7 @@ static json format_final_response_oaicompat(const json & request, json result, b
             {"prompt_tokens", num_prompt_tokens},
             {"total_tokens", num_tokens_predicted + num_prompt_tokens}
         }},
-        {"id", gen_chatcmplid()}
+        {"id", completion_id}
     };

     if (server_verbose) {

@@ -427,7 +427,7 @@ static json format_final_response_oaicompat(const json & request, json result, b
 }

 // return value is vector as there is one case where we might need to generate two responses
-static std::vector<json> format_partial_response_oaicompat(json result) {
+static std::vector<json> format_partial_response_oaicompat(json result, const std::string & completion_id) {
     if (!result.contains("model") || !result.contains("oaicompat_token_ctr")) {
         return std::vector<json>({result});
     }

@@ -471,7 +471,7 @@ static std::vector<json> format_partial_response_oaicompat(json result) {
                             {"role", "assistant"}
                         }}}})},
             {"created", t},
-            {"id", gen_chatcmplid()},
+            {"id", completion_id},
             {"model", modelname},
             {"object", "chat.completion.chunk"}};

@@ -482,7 +482,7 @@ static std::vector<json> format_partial_response_oaicompat(json result) {
                     {"content", content}}}
             }})},
             {"created", t},
-            {"id", gen_chatcmplid()},
+            {"id", completion_id},
             {"model", modelname},
             {"object", "chat.completion.chunk"}};

@@ -509,7 +509,7 @@ static std::vector<json> format_partial_response_oaicompat(json result) {
     json ret = json {
         {"choices", choices},
         {"created", t},
-        {"id", gen_chatcmplid()},
+        {"id", completion_id},
         {"model", modelname},
         {"object", "chat.completion.chunk"}
     };
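
Taken together, the server.cpp and utils.hpp changes make the chat completion id a per-request value: it is generated once with `gen_chatcmplid()`, captured by value in the streaming lambda, and passed into both formatters, so the final response and every streamed chunk of one request share the same "id", as OpenAI-compatible clients expect. Below is a minimal sketch of that pattern only; `make_chatcmplid` and `format_chunk` are illustrative stand-ins, not functions from the server.

```cpp
#include <cstdio>
#include <random>
#include <string>

// Stand-in for gen_chatcmplid(): an OpenAI-style "chatcmpl-..." identifier.
static std::string make_chatcmplid() {
    static std::mt19937 rng{std::random_device{}()};
    return "chatcmpl-" + std::to_string(rng());
}

// Stand-in for format_partial_response_oaicompat(): the id is a parameter,
// not re-generated inside the formatter.
static std::string format_chunk(const std::string & completion_id, const std::string & content) {
    return "{\"id\":\"" + completion_id + "\",\"object\":\"chat.completion.chunk\","
           "\"choices\":[{\"delta\":{\"content\":\"" + content + "\"}}]}";
}

int main() {
    // Generated once per request, before choosing between the blocking and
    // streaming paths, mirroring the patched server.cpp above.
    const auto completion_id = make_chatcmplid();

    // The streaming provider captures the id by value, like the
    // [id_task, &ctx_server, completion_id] capture list in the diff.
    auto chunked_content_provider = [completion_id](const std::string & piece) {
        printf("%s\n", format_chunk(completion_id, piece).c_str());
    };

    chunked_content_provider("Hel");
    chunked_content_provider("lo");  // same "id" on every chunk of this request
    return 0;
}
```

Both printed chunks carry the identical chatcmpl id, whereas the previous code minted a fresh id inside each formatter call.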
