
Commit 2c1c46a

Merge pull request #4 from l3utterfly/master
merge master into layla build
2 parents: 210b644 + caa106d


58 files changed: +5693 / -4361 lines

.github/workflows/build.yml

Lines changed: 3 additions & 1 deletion
@@ -425,6 +425,8 @@ jobs:
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
          - build: 'vulkan'
            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+         - build: 'arm64'
+           defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'

     steps:
       - name: Clone
@@ -520,7 +522,7 @@ jobs:
       - name: Test
        id: cmake_test
        # not all machines have native AVX-512
-       if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
+       if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
        run: |
          cd build
          ctest -L main -C Release --verbose --timeout 900

.github/workflows/server.yml

Lines changed: 45 additions & 1 deletion
@@ -47,6 +47,8 @@ jobs:
       - name: Clone
         id: checkout
         uses: actions/checkout@v3
+        with:
+          fetch-depth: 0

       - name: Dependencies
         id: depends
@@ -58,7 +60,6 @@
             cmake \
             python3-pip \
             wget \
-            psmisc \
             language-pack-en

       - name: Build
@@ -90,3 +91,46 @@
         run: |
           cd examples/server/tests
           PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
+
+
+  server-windows:
+    runs-on: windows-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 0
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ;
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server
+
+      - name: Python setup
+        id: setup_python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Tests dependencies
+        id: test_dependencies
+        run: |
+          pip install -r examples/server/tests/requirements.txt
+
+      - name: Tests
+        id: server_integration_tests
+        run: |
+          cd examples/server/tests
+          behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
+
+      - name: Slow tests
+        id: server_integration_tests_slow
+        if: ${{ github.event.schedule != '' || github.event.inputs.slow_tests == 'true' }}
+        run: |
+          cd examples/server/tests
+          behave.exe --stop --no-skipped --no-capture --tags slow

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ models-mnt
 /embedding
 /gguf
 /gguf-llama-simple
+/gritlm
 /imatrix
 /infill
 /libllama.so

CMakeLists.txt

Lines changed: 16 additions & 3 deletions
@@ -116,6 +116,7 @@ option(LLAMA_MPI "llama: use MPI"
 option(LLAMA_QKK_64 "llama: use super-block size of 64 for k-quants" OFF)
 option(LLAMA_SYCL "llama: use SYCL" OFF)
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
+set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
 option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF)

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
@@ -199,14 +200,15 @@ if (LLAMA_METAL)
     # get full path to the file
     #add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")

-    # copy ggml-metal.metal to bin directory
+    # copy ggml-common.h and ggml-metal.metal to bin directory
+    configure_file(ggml-common.h ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h COPYONLY)
     configure_file(ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)

     if (LLAMA_METAL_EMBED_LIBRARY)
         enable_language(ASM)
         add_compile_definitions(GGML_METAL_EMBED_LIBRARY)

-        set(METALLIB_SOURCE "${CMAKE_SOURCE_DIR}/ggml-metal.metal")
+        set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
         file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
         set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")

@@ -477,6 +479,10 @@ if (LLAMA_HIPBLAS)
 endif()

 if (LLAMA_SYCL)
+    if (NOT LLAMA_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
+        message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
+    endif()
+
     if ( NOT DEFINED ENV{ONEAPI_ROOT})
         message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
     endif()
@@ -498,14 +504,21 @@ if (LLAMA_SYCL)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
+    if (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
+    endif()

     set(GGML_HEADERS_SYCL ggml-sycl.h)
     set(GGML_SOURCES_SYCL ggml-sycl.cpp)

     if (WIN32)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl sycl7 OpenCL mkl_sycl_blas_dll.lib mkl_intel_ilp64_dll.lib mkl_sequential_dll.lib mkl_core_dll.lib)
     else()
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+        if (LLAMA_SYCL_TARGET STREQUAL "INTEL")
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
+        elseif (LLAMA_SYCL_TARGET STREQUAL "NVIDIA")
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} -fsycl pthread m dl onemkl)
+        endif()
     endif()
 endif()

Makefile

Lines changed: 11 additions & 3 deletions
@@ -2,7 +2,7 @@
 BUILD_TARGETS = \
	main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \
	simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \
-	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o
+	speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o

 # Binaries only useful for tests
 TEST_TARGETS = \
@@ -201,6 +201,10 @@ ifdef LLAMA_SERVER_VERBOSE
	MK_CPPFLAGS += -DSERVER_VERBOSE=$(LLAMA_SERVER_VERBOSE)
 endif

+ifdef LLAMA_SERVER_SSL
+	MK_CPPFLAGS += -DCPPHTTPLIB_OPENSSL_SUPPORT
+	MK_LDFLAGS += -lssl -lcrypto
+endif

 ifdef LLAMA_CODE_COVERAGE
	MK_CXXFLAGS += -fprofile-arcs -ftest-coverage -dumpbase ''
@@ -449,7 +453,7 @@ endif # LLAMA_CUDA_PEER_MAX_BATCH_SIZE
 ifdef LLAMA_CUDA_CCBIN
	MK_NVCCFLAGS += -ccbin $(LLAMA_CUDA_CCBIN)
 endif
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml-common.h
 ifdef JETSON_EOL_MODULE_DETECT
	$(NVCC) -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I/usr/local/cuda/targets/aarch64-linux/include -std=c++11 -O3 $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
 else
@@ -626,7 +630,7 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
	$(CC) $(CFLAGS) -c $< -o $@

-ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h
+ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
	$(CC) $(CFLAGS) -c $< -o $@

 OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
@@ -720,6 +724,10 @@ embedding: examples/embedding/embedding.cpp ggml.o llama.o $(C
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

+gritlm: examples/gritlm/gritlm.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+
 save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

README-sycl.md

Lines changed: 26 additions & 0 deletions
@@ -73,6 +73,29 @@ For iGPU, please make sure the shared memory from host memory is enough. For lla

 For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, recommend the device memory is 4GB+.

+## Nvidia GPU
+
+### Verified
+
+|Intel GPU| Status | Verified Model|
+|-|-|-|
+|Ampere Series| Support| A100|
+
+### oneMKL
+
+The current oneMKL release does not contain the oneMKL cuBlas backend.
+As a result for Nvidia GPU's oneMKL must be built from source.
+
+```
+git clone https://github.com/oneapi-src/oneMKL
+cd oneMKL
+mkdir build
+cd build
+cmake -G Ninja .. -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON
+ninja
+// Add paths as necessary
+```
+
 ## Docker

 Note:
@@ -186,6 +209,9 @@ source /opt/intel/oneapi/setvars.sh
 # Or, for FP32:
 cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx

+# For Nvidia GPUs
+cmake .. -DLLAMA_SYCL=ON -DLLAMA_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+
 # Build example/main only
 #cmake --build . --config Release --target main

README.md

Lines changed: 5 additions & 5 deletions
@@ -10,16 +10,15 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Recent API changes

+- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849

 ### Hot topics

-- The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761))
-- Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
-- Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
-- Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
-- Looking for contributions to improve and maintain the `server` example: https://github.com/ggerganov/llama.cpp/issues/4216
+- Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
+- Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
+- Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328

 ----

@@ -110,6 +109,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [InternLM2](https://huggingface.co/models?search=internlm2)
 - [x] [CodeShell](https://github.com/WisdomShell/codeshell)
 - [x] [Gemma](https://ai.google.dev/gemma)
+- [x] [Mamba](https://github.com/state-spaces/mamba)

 **Multimodal models:**

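As context for the `llama_kv_cache_seq_rm()` entry in the API-change list above, here is a hedged sketch of how a caller might react to the new `bool` return. The `llama.h` signatures are assumed from the linked PR rather than shown in this diff, and `clear_prompt_range` is a hypothetical helper, not code from this commit:

```cpp
#include "llama.h"

// Hypothetical helper: try to drop part of a sequence from the KV cache and,
// if partial removal is not supported (the call now reports this via its bool
// return instead of returning void), fall back to clearing the whole sequence.
static void clear_prompt_range(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
    if (!llama_kv_cache_seq_rm(ctx, seq_id, p0, p1)) {
        llama_kv_cache_seq_rm(ctx, seq_id, -1, -1); // negative bounds cover the entire sequence
    }
}
```
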
common/common.cpp

Lines changed: 16 additions & 0 deletions
@@ -1288,6 +1288,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param

     cparams.n_ctx = params.n_ctx;
     cparams.n_batch = params.n_batch;
+    cparams.n_parallel = params.n_parallel;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
     cparams.seed = params.seed;
@@ -1851,3 +1852,18 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {

     printf("\n=== Done dumping\n");
 }
+
+void llama_embd_normalize(const float * inp, float * out, int n) {
+    double sum = 0.0;
+    for (int i = 0; i < n; i++) {
+        sum += inp[i] * inp[i];
+    }
+    sum = sqrt(sum);
+
+    const float norm = sum > 0.0 ? 1.0f / sum : 0.0f;
+
+    for (int i = 0; i < n; i++) {
+        out[i] = inp[i] * norm;
+    }
+}
+
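
A minimal usage sketch of the new `llama_embd_normalize()` helper added above (values are made up for illustration; it assumes the `common` library from this commit is built and `common.h` is on the include path):

```cpp
#include "common.h"

#include <cstdio>
#include <vector>

int main() {
    // a made-up 4-dimensional embedding
    const std::vector<float> embd = { 3.0f, 0.0f, 4.0f, 0.0f };
    std::vector<float> out(embd.size());

    // scales the input to unit L2 norm; an all-zero input stays all zeros
    llama_embd_normalize(embd.data(), out.data(), (int) embd.size());

    for (const float v : out) {
        printf("%.2f ", v); // prints: 0.60 0.00 0.80 0.00
    }
    printf("\n");
    return 0;
}
```
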
common/common.h

Lines changed: 7 additions & 0 deletions
@@ -260,3 +260,10 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
 void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+
+//
+// Embedding utils
+//
+
+void llama_embd_normalize(const float * inp, float * out, int n);
+
common/grammar-parser.cpp

Lines changed: 16 additions & 0 deletions
@@ -278,6 +278,22 @@ namespace grammar_parser {
             while (*pos) {
                 pos = parse_rule(state, pos);
             }
+            // Validate the state to ensure that all rules are defined
+            for (const auto & rule : state.rules) {
+                for (const auto & elem : rule) {
+                    if (elem.type == LLAMA_GRETYPE_RULE_REF) {
+                        // Ensure that the rule at that location exists
+                        if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) {
+                            // Get the name of the rule that is missing
+                            for (const auto & kv : state.symbol_ids) {
+                                if (kv.second == elem.value) {
+                                    throw std::runtime_error("Undefined rule identifier '" + kv.first + "'");
+                                }
+                            }
+                        }
+                    }
+                }
+            }
             return state;
         } catch (const std::exception & err) {
             fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what());