Commit 210b644

Merge branch 'master' into layla-build

2 parents 1d0fba9 + 581ed5c


86 files changed: +53612 -50972 lines

.devops/nix/package.nix

Lines changed: 11 additions & 4 deletions

@@ -1,5 +1,6 @@
 {
   lib,
+  glibc,
   config,
   stdenv,
   mkShell,
@@ -30,6 +31,11 @@
   useRocm ? config.rocmSupport,
   useVulkan ? false,
   llamaVersion ? "0.0.0", # Arbitrary version, substituted by the flake
+
+  # It's necessary to consistently use backendStdenv when building with CUDA support,
+  # otherwise we get libstdc++ errors downstream.
+  effectiveStdenv ? if useCuda then cudaPackages.backendStdenv else stdenv,
+  enableStatic ? effectiveStdenv.hostPlatform.isStatic
 }@inputs:

 let
@@ -41,10 +47,7 @@ let
     versionOlder
     ;

-  # It's necessary to consistently use backendStdenv when building with CUDA support,
-  # otherwise we get libstdc++ errors downstream.
   stdenv = throw "Use effectiveStdenv instead";
-  effectiveStdenv = if useCuda then cudaPackages.backendStdenv else inputs.stdenv;

   suffices =
     lib.optionals useBlas [ "BLAS" ]
@@ -167,6 +170,9 @@ effectiveStdenv.mkDerivation (
         # TODO: Replace with autoAddDriverRunpath
         # once https://github.com/NixOS/nixpkgs/pull/275241 has been merged
         cudaPackages.autoAddOpenGLRunpathHook
+      ]
+      ++ optionals (effectiveStdenv.hostPlatform.isGnu && enableStatic) [
+        glibc.static
       ];

       buildInputs =
@@ -181,7 +187,7 @@ effectiveStdenv.mkDerivation (
       [
         (cmakeBool "LLAMA_NATIVE" false)
         (cmakeBool "LLAMA_BUILD_SERVER" true)
-        (cmakeBool "BUILD_SHARED_LIBS" true)
+        (cmakeBool "BUILD_SHARED_LIBS" (!enableStatic))
         (cmakeBool "CMAKE_SKIP_BUILD_RPATH" true)
         (cmakeBool "LLAMA_BLAS" useBlas)
         (cmakeBool "LLAMA_CLBLAST" useOpenCL)
@@ -190,6 +196,7 @@ effectiveStdenv.mkDerivation (
         (cmakeBool "LLAMA_METAL" useMetalKit)
         (cmakeBool "LLAMA_MPI" useMpi)
         (cmakeBool "LLAMA_VULKAN" useVulkan)
+        (cmakeBool "LLAMA_STATIC" enableStatic)
       ]
       ++ optionals useCuda [
         (
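
In practical terms, the new `enableStatic` argument (defaulting to `effectiveStdenv.hostPlatform.isStatic`) just flips the shared/static switches shown above. A rough, non-authoritative sketch of the equivalent manual configuration when static linking is wanted (the build directory and the selection of flags are illustrative, limited to what is visible in this hunk):

    # configure a static build by hand, mirroring the cmakeBool flags above
    mkdir build && cd build
    cmake .. -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=OFF -DLLAMA_STATIC=ON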

.devops/nix/sif.nix

Lines changed: 1 addition & 1 deletion

@@ -7,7 +7,7 @@
 }:

 let
-  optionalInt = cond: x: if cond then x else 0;
+  optionalInt = cond: x: if cond then x else 0;
 in
 singularity-tools.buildImage rec {
   inherit (llama-cpp) name;

.github/workflows/build.yml

Lines changed: 22 additions & 0 deletions

@@ -145,6 +145,28 @@ jobs:
          cd build
          ctest -L main --verbose

+  ubuntu-22-cmake-vulkan:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v3
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential libvulkan-dev
+
+      - name: Build
+        id: cmake_build
+        run: |
+          mkdir build
+          cd build
+          cmake -DLLAMA_VULKAN=ON ..
+          cmake --build . --config Release -j $(nproc)
+
   ubuntu-22-cmake-sycl:
     runs-on: ubuntu-22.04
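
For reference, the new `ubuntu-22-cmake-vulkan` job can be reproduced locally with essentially the same commands; this sketch assumes an Ubuntu 22.04 host and the repository root as the working directory:

    sudo apt-get update
    sudo apt-get install build-essential libvulkan-dev
    mkdir build && cd build
    cmake -DLLAMA_VULKAN=ON ..
    cmake --build . --config Release -j $(nproc)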

.github/workflows/python-check-requirements.yml

Lines changed: 3 additions & 1 deletion

@@ -3,12 +3,14 @@ name: Python check requirements.txt
 on:
   push:
     paths:
+      - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
       - 'requirements.txt'
       - 'requirements/*.txt'
   pull_request:
     paths:
+      - '.github/workflows/python-check-requirements.yml'
       - 'scripts/check-requirements.sh'
       - 'convert*.py'
       - 'requirements.txt'
@@ -26,4 +28,4 @@ jobs:
         with:
           python-version: "3.11"
       - name: Run check-requirements.sh script
-        run: bash scripts/check-requirements.sh nocleanup
+        run: bash scripts/check-requirements.sh
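
The same check can be run locally exactly as the updated workflow now invokes it, i.e. without the old `nocleanup` argument:

    bash scripts/check-requirements.sh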

.github/workflows/server.yml

Lines changed: 33 additions & 68 deletions

@@ -3,60 +3,42 @@ name: Server

 on:
   workflow_dispatch: # allows manual triggering
+    inputs:
+      slow_tests:
+        description: 'Run slow tests'
+        required: true
+        type: boolean
   push:
     branches:
       - master
-      - test/server-add-ci-test # FIXME remove
-    paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
   pull_request:
     types: [opened, synchronize, reopened]
-    paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/**.*']
+    paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*']
+  schedule:
+    - cron: '0 0 * * *'

 jobs:
   server:
     runs-on: ubuntu-latest

     strategy:
       matrix:
-        build: [noavx, avx2, avx, avx512, cublas, clblast, openblas, kompute, vulkan]
         sanitizer: [ADDRESS, THREAD, UNDEFINED]
         build_type: [Debug, Release]
         include:
-          - build: 'noavx'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF'
-            image: ubuntu:latest
-          - build: 'avx2'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
-            image: ubuntu:latest
-          - build: 'avx'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX2=OFF'
-            image: ubuntu:latest
-          - build: 'avx512'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_AVX512=ON'
-            image: ubuntu:latest
-            experimental: true
-          - build: 'cublas'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CUBLAS=ON'
-            image: nvidia/cuda:12.3.1-devel-ubuntu22.04
-            arch_not_available: true # require nvidia docker engine
-          - build: 'clblast'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_CLBLAST=ON'
-            image: ubuntu:latest
-            arch_not_available: true
-          - build: 'openblas'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS'
-            image: ubuntu:latest
-          - build: 'kompute'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
-            image: ubuntu:latest
-            arch_not_available: true
-          - build: 'vulkan'
-            defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON'
-            image: ubuntu:latest
-            arch_not_available: true
+          - build_type: Release
+            sanitizer: ""
+        exclude:
+          - build_type: Release
+            sanitizer: ADDRESS
+          - build_type: Release
+            sanitizer: THREAD
+          - build_type: Release
+            sanitizer: UNDEFINED

     container:
-      image: ${{ matrix.image }}
+      image: ubuntu:latest
       ports:
         - 8888
       options: --cpus 4
@@ -72,56 +54,39 @@ jobs:
          apt-get update
          apt-get -y install \
            build-essential \
-           pkg-config \
            git \
            cmake \
            python3-pip \
            wget \
-           psmisc
-
-      - name: Download CLBlast
-        id: get_clblast
-        if: ${{ matrix.build == 'clblast' }}
-        run: |
-          apt install -y libclblast-dev
-
-      - name: Download OpenBLAS
-        id: get_openblas
-        if: ${{ matrix.build == 'openblas' }}
-        run: |
-          apt-get -y install libopenblas-dev
-
-      - name: Install Vulkan SDK
-        id: get_vulkan
-        if: ${{ matrix.build == 'kompute' || matrix.build == 'vulkan' }}
-        run: |
-          wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | tee /etc/apt/trusted.gpg.d/lunarg.asc
-          wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
-          apt-get update
-          apt-get -y install vulkan-sdk
+           psmisc \
+           language-pack-en

      - name: Build
        id: cmake_build
        run: |
          mkdir build
          cd build
-         cmake .. -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} ${{ matrix.defines }}
+         cmake .. \
+           -DLLAMA_NATIVE=OFF \
+           -DLLAMA_BUILD_SERVER=ON \
+           -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+           -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ;
          cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server

      - name: Tests dependencies
        id: test_dependencies
        run: |
          pip install -r examples/server/tests/requirements.txt

-     - name: Download models
-       id: download_models
+     - name: Tests
+       id: server_integration_tests
        run: |
          cd examples/server/tests
-         ../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf
+         PORT=8888 ./tests.sh

-     - name: Tests
-       id: server_integration_test
-       continue-on-error: ${{ matrix.experimental || matrix.arch_not_available }}
+     - name: Slow tests
+       id: server_integration_tests_slow
+       if: ${{ github.event.schedule != '' && matrix.build_type == 'Release' || github.event.inputs.slow_tests == 'true' }}
        run: |
          cd examples/server/tests
-         PORT=8888 ./tests.sh
+         PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow
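
Put together, the reworked job reduces to the following local flow once a `server` binary has been built (a sketch; port 8888 matches the workflow, but any free port should work):

    pip install -r examples/server/tests/requirements.txt
    cd examples/server/tests
    PORT=8888 ./tests.sh                                              # regular suite
    PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow # slow suite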

Makefile

Lines changed: 9 additions & 5 deletions

@@ -381,8 +381,13 @@ ifdef LLAMA_BLIS
 endif # LLAMA_BLIS

 ifdef LLAMA_CUBLAS
-	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include -I/usr/local/cuda/targets/aarch64-linux/include
-	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib -L/usr/local/cuda/targets/aarch64-linux/lib -L/usr/lib/wsl/lib
+	ifneq ('', '$(wildcard /opt/cuda)')
+		CUDA_PATH ?= /opt/cuda
+	else
+		CUDA_PATH ?= /usr/local/cuda
+	endif
+	MK_CPPFLAGS  += -DGGML_USE_CUBLAS -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_LDFLAGS   += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS        += ggml-cuda.o
 	MK_NVCCFLAGS += -use_fast_math
 ifdef LLAMA_FATAL_WARNINGS
@@ -719,10 +724,9 @@ save-load-state: examples/save-load-state/save-load-state.cpp ggml.o llama.o $(C
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)

-server: examples/server/server.cpp examples/server/oai.hpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp examples/llava/clip.cpp examples/llava/clip.h examples/llava/llava.h examples/llava/llava.cpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
+server: examples/server/server.cpp examples/server/utils.hpp examples/server/httplib.h examples/server/json.hpp examples/server/index.html.hpp examples/server/index.js.hpp examples/server/completion.js.hpp common/stb_image.h ggml.o llama.o $(COMMON_DEPS) grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) -c examples/llava/clip.cpp -o $(call GET_OBJ_FILE, examples/llava/clip.cpp) -Wno-cast-qual
-	$(CXX) $(CXXFLAGS) -Iexamples/server $(filter-out %.h %.hpp $< examples/llava/clip.cpp,$^) $(call GET_OBJ_FILE, $<) $(call GET_OBJ_FILE, examples/llava/clip.cpp) -o $@ $(LDFLAGS) $(LWINSOCK2)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h %.hpp $<,$^) -Iexamples/server $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LWINSOCK2)

 gguf: examples/gguf/gguf.cpp ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
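
With this change `CUDA_PATH` is auto-detected (`/opt/cuda` if present, otherwise `/usr/local/cuda`), and because the Makefile only uses `?=` it can still be set explicitly from the environment or the command line. A usage sketch (the versioned path is an illustrative example, not taken from this commit):

    # auto-detected CUDA location
    make LLAMA_CUBLAS=1 -j $(nproc)
    # point at a specific toolkit install instead
    make LLAMA_CUBLAS=1 CUDA_PATH=/usr/local/cuda-12.3 -j $(nproc)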

README-sycl.md

Lines changed: 21 additions & 0 deletions

@@ -1,6 +1,7 @@
 # llama.cpp for SYCL

 - [Background](#background)
+- [News](#news)
 - [OS](#os)
 - [Intel GPU](#intel-gpu)
 - [Docker](#docker)
@@ -25,6 +26,21 @@ The llama.cpp for SYCL is used to support Intel GPUs.

 For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building).

+## News
+
+- 2024.3
+  - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing.
+  - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE.
+  - Support detecting all GPUs with level-zero and same top **Max compute units**.
+  - Support OPs
+    - hardsigmoid
+    - hardswish
+    - pool2d
+
+- 2024.1
+  - Create SYCL backend for Intel GPU.
+  - Support Windows build
+
 ## OS

 |OS|Status|Verified|
@@ -449,6 +465,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device
 |-|-|-|
 |GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output|
 |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG|
+|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer|

 ## Known Issue

@@ -458,6 +475,10 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device

   Solution: add **--no-mmap** or **--mmap 0**.

+- Split-mode: [row] is not supported
+
+  It's on developing.
+
 ## Q&A

 - Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`.
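
Per the 2024.3 news entries above, multi-GPU use is now driven by `--split-mode` and `--main-gpu` instead of `$GGML_SYCL_DEVICE`, with `ZES_ENABLE_SYSMAN=1` recommended for layer splitting. A hypothetical invocation (the binary and model paths are placeholders, not from this commit):

    ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/your-model.gguf -p "Hello" -ngl 33 --split-mode layer --main-gpu 0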

README.md

Lines changed: 10 additions & 3 deletions

@@ -8,8 +8,14 @@

 Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) in pure C/C++

+### Recent API changes
+
+- [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
+- [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
+
 ### Hot topics

+- The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761))
 - Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
 - Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631
 - Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590
@@ -107,7 +113,7 @@ Typically finetunes of the base models below are supported as well.

 **Multimodal models:**

-- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e)
+- [x] [LLaVA 1.5 models](https://huggingface.co/collections/liuhaotian/llava-15-653aac15d994e992e2677a7e), [LLaVA 1.6 models](https://huggingface.co/collections/liuhaotian/llava-16-65b9e40155f60fd046a5ccf2)
 - [x] [BakLLaVA](https://huggingface.co/models?search=SkunkworksAI/Bakllava)
 - [x] [Obsidian](https://huggingface.co/NousResearch/Obsidian-3B-V0.5)
 - [x] [ShareGPT4V](https://huggingface.co/models?search=Lin-Chen/ShareGPT4V)
@@ -159,6 +165,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [withcatai/catai](https://github.com/withcatai/catai)
 - [Mobile-Artificial-Intelligence/maid](https://github.com/Mobile-Artificial-Intelligence/maid) (MIT)
 - [Msty](https://msty.app) (proprietary)
+- [LLMFarm](https://github.com/guinmoon/LLMFarm?tab=readme-ov-file) (MIT)

 ---

@@ -784,7 +791,7 @@ And after 4.45 hours, you will have the final perplexity.
 ### Interactive mode

 If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter.
-In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.
+In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`.

 Here is an example of a few-shot interaction, invoked with the command

@@ -848,7 +855,7 @@ Sample run:
 ```
 == Running in interactive mode. ==
  - Press Ctrl+C to interject at any time.
- - Press Return to return control to LLaMa.
+ - Press Return to return control to LLaMA.
  - If you want to submit another line, end your input in '\'.

 Below is an instruction that describes a task. Write a response that appropriately completes the request.
