Commit aa01153

Merge branch 'master' into gguf-py-contents

2 parents: 9ab2f4a + 3567ee3

File tree

138 files changed: +8495 −3510 lines


.github/workflows/build.yml

Lines changed: 16 additions & 4 deletions

@@ -173,7 +173,15 @@ jobs:
           name: llama-bin-macos-x64.zip
 
   ubuntu-cpu-cmake:
-    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        include:
+          - build: 'x64'
+            os: ubuntu-22.04
+          - build: 'arm64'
+            os: ubuntu-22.04-arm
+
+    runs-on: ${{ matrix.os }}
 
     steps:
       - name: Clone
@@ -239,14 +247,14 @@ jobs:
         run: |
           cp LICENSE ./build/bin/
           cp examples/run/linenoise.cpp/LICENSE ./build/bin/LICENSE.linenoise.cpp
-          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip ./build/bin/*
+          zip -r llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip ./build/bin/*
 
       - name: Upload artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         uses: actions/upload-artifact@v4
         with:
-          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-x64.zip
-          name: llama-bin-ubuntu-x64.zip
+          path: llama-${{ steps.tag.outputs.name }}-bin-ubuntu-${{ matrix.build }}.zip
+          name: llama-bin-ubuntu-${{ matrix.build }}.zip
 
   ubuntu-latest-cmake-sanitizer:
     runs-on: ubuntu-latest
@@ -374,6 +382,8 @@ jobs:
       - name: Clone
        id: checkout
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
       - name: ccache
         uses: hendrikmuhs/[email protected]
@@ -1373,8 +1383,10 @@ jobs:
 
     needs:
       - ubuntu-cpu-cmake
+      - ubuntu-22-cmake-vulkan
      - windows-latest-cmake
       - windows-2019-cmake-cuda
+      - windows-latest-cmake-sycl
       - windows-latest-cmake-hip-release
       - macOS-latest-cmake-arm64
       - macOS-latest-cmake-x64

.gitignore

Lines changed: 1 addition & 0 deletions

@@ -98,6 +98,7 @@ examples/server/*.css.hpp
 examples/server/*.html.hpp
 examples/server/*.js.hpp
 examples/server/*.mjs.hpp
+examples/server/*.gz.hpp
 !build_64.sh
 !examples/*.bat
 !examples/*/*.kts

CONTRIBUTING.md

Lines changed: 2 additions & 0 deletions

@@ -1,10 +1,12 @@
 # Pull requests (for contributors)
 
+- llama.cpp uses the ggml tensor library for model evaluation. If you are unfamiliar with ggml, consider taking a look at the [examples in the ggml repository](https://github.com/ggml-org/ggml/tree/master/examples/). [simple](https://github.com/ggml-org/ggml/tree/master/examples/simple) shows the bare minimum for using ggml. [gpt-2](https://github.com/ggml-org/ggml/tree/master/examples/gpt-2) has minimal implementations for language model inference using GPT-2. [mnist](https://github.com/ggml-org/ggml/tree/master/examples/mnist) demonstrates how to train and evaluate a simple image classifier
 - Test your changes:
   - Execute [the full CI locally on your machine](ci/README.md) before publishing
   - Verify that the perplexity and the performance are not affected negatively by your changes (use `llama-perplexity` and `llama-bench`)
   - If you modified the `ggml` source, run the `test-backend-ops` tool to check whether different backend implementations of the `ggml` operators produce consistent results (this requires access to at least two different `ggml` backends)
   - If you modified a `ggml` operator or added a new one, add the corresponding test cases to `test-backend-ops`
+- Create separate PRs for each feature or fix. Avoid combining unrelated changes in a single PR
 - Consider allowing write access to your branch for faster reviews, as reviewers can push commits directly
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments
 

Makefile

Lines changed: 14 additions & 2 deletions

@@ -680,6 +680,10 @@ ifdef GGML_CUDA_CCBIN
 	MK_NVCCFLAGS += -ccbin $(GGML_CUDA_CCBIN)
 endif # GGML_CUDA_CCBIN
 
+ifdef GGML_CUDA_NO_FA
+	MK_NVCCFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
 ifdef GGML_CUDA_FA_ALL_QUANTS
 	MK_NVCCFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # GGML_CUDA_FA_ALL_QUANTS
@@ -800,6 +804,10 @@ ifdef GGML_CUDA_NO_PEER_COPY
 	HIPFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY
 
+ifdef GGML_CUDA_NO_FA
+	HIPFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
 OBJ_GGML_EXT += ggml/src/ggml-cuda/ggml-cuda.o
 OBJ_GGML_EXT += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
 OBJ_GGML_EXT += $(OBJ_CUDA_TMPL)
@@ -847,7 +855,7 @@ ifdef GGML_MUSA
 	CXX := $(MUSA_PATH)/bin/clang++
 	MCC := $(CCACHE) $(MUSA_PATH)/bin/mcc
 
-	MUSAFLAGS = -x musa -mtgpu
+	MUSAFLAGS = -fsigned-char -x musa -mtgpu
 	MUSAFLAGS += $(foreach arch,$(subst ;, ,$(MUSA_ARCHITECTURES)),--cuda-gpu-arch=mp_$(arch))
 
 ifdef GGML_CUDA_FORCE_MMQ
@@ -876,6 +884,10 @@ ifdef GGML_CUDA_NO_PEER_COPY
 	MUSAFLAGS += -DGGML_CUDA_NO_PEER_COPY
 endif # GGML_CUDA_NO_PEER_COPY
 
+ifdef GGML_CUDA_NO_FA
+	MUSAFLAGS += -DGGML_CUDA_NO_FA
+endif # GGML_CUDA_NO_FA
+
 ifdef GGML_CUDA_FA_ALL_QUANTS
 	MUSAFLAGS += -DGGML_CUDA_FA_ALL_QUANTS
 endif # GGML_CUDA_FA_ALL_QUANTS
@@ -1364,7 +1376,7 @@ llama-server: \
 	examples/server/index.html.hpp \
 	examples/server/loading.html.hpp \
 	common/chat.cpp \
-	common/chat.hpp \
+	common/chat.h \
 	common/chat-template.hpp \
 	common/json.hpp \
 	common/minja.hpp \

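Note on the Makefile change above: the new GGML_CUDA_NO_FA blocks only pass -DGGML_CUDA_NO_FA through MK_NVCCFLAGS, HIPFLAGS, and MUSAFLAGS; the flag takes effect wherever the sources test that define. The following C++ snippet is a minimal, self-contained sketch of that pattern under the conventional #ifdef guard; the names fake_params and flash_attn_supported are illustrative placeholders, not code from ggml-cuda.

// Illustrative only: how a -DGGML_CUDA_NO_FA build flag is typically consumed in
// source. The struct and function below are hypothetical stand-ins, not the
// repository's actual kernels.
#include <cstdio>

struct fake_params { int head_size; };   // hypothetical stand-in for kernel parameters

static bool flash_attn_supported(const fake_params & p) {
#ifdef GGML_CUDA_NO_FA
    // Built with GGML_CUDA_NO_FA: the flash-attention path is compiled out entirely.
    (void) p;
    return false;
#else
    // Normal build: fall back to a runtime capability check (illustrative condition).
    return p.head_size % 32 == 0;
#endif
}

int main() {
    fake_params p{128};
    std::printf("flash attention available: %s\n", flash_attn_supported(p) ? "yes" : "no");
    return 0;
}

Building with GGML_CUDA_NO_FA set would then compile the early-return branch, while a normal build keeps the runtime check.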
README.md

Lines changed: 1 addition & 1 deletion

@@ -219,7 +219,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 - [llama_cpp_canister](https://github.com/onicai/llama_cpp_canister) - llama.cpp as a smart contract on the Internet Computer, using WebAssembly
 - [llama-swap](https://github.com/mostlygeek/llama-swap) - transparent proxy that adds automatic model switching with llama-server
 - [Kalavai](https://github.com/kalavai-net/kalavai-client) - Crowdsource end to end LLM deployment at any scale
-
+- [llmaz](https://github.com/InftyAI/llmaz) - ☸️ Easy, advanced inference platform for large language models on Kubernetes.
 </details>
 
 <details>

common/CMakeLists.txt

Lines changed: 3 additions & 3 deletions

@@ -57,8 +57,7 @@ add_library(${TARGET} STATIC
     arg.h
     base64.hpp
     chat.cpp
-    chat.hpp
-    chat-template.hpp
+    chat.h
     common.cpp
     common.h
     console.cpp
@@ -68,7 +67,8 @@ add_library(${TARGET} STATIC
     llguidance.cpp
     log.cpp
     log.h
-    minja.hpp
+    minja/chat-template.hpp
+    minja/minja.hpp
     ngram-cache.cpp
     ngram-cache.h
     sampling.cpp

common/arg.cpp

Lines changed: 49 additions & 0 deletions

@@ -2,6 +2,7 @@
 
 #include "log.h"
 #include "sampling.h"
+#include "chat.h"
 
 #include <algorithm>
 #include <climits>
@@ -2501,5 +2502,53 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_SERVER}));
 
+    add_opt(common_arg(
+        {"--fim-qwen-1.5b-default"},
+        string_format("use default Qwen 2.5 Coder 1.5B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-1.5b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-3b-default"},
+        string_format("use default Qwen 2.5 Coder 3B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-3B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-3b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
+    add_opt(common_arg(
+        {"--fim-qwen-7b-default"},
+        string_format("use default Qwen 2.5 Coder 7B (note: can download weights from the internet)"),
+        [](common_params & params) {
+            params.hf_repo = "ggml-org/Qwen2.5-Coder-7B-Q8_0-GGUF";
+            params.hf_file = "qwen2.5-coder-7b-q8_0.gguf";
+            params.port = 8012;
+            params.n_gpu_layers = 99;
+            params.flash_attn = true;
+            params.n_ubatch = 1024;
+            params.n_batch = 1024;
+            params.n_ctx = 0;
+            params.n_cache_reuse = 256;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}));
+
     return ctx_arg;
 }

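A side note on the common/arg.cpp additions above: the three --fim-qwen-*-default presets register identical server defaults and differ only in the Hugging Face repo and file names. The sketch below is a hypothetical consolidation of those shared defaults into one helper, written against a small stub struct rather than the real common_params; it is illustrative only and not part of this commit.

// Hypothetical refactor sketch (not in the commit): factor the shared server
// defaults used by the --fim-qwen-*-default presets into a single helper.
#include <string>

struct common_params_stub {              // minimal stand-in for common_params
    std::string hf_repo, hf_file;
    int  port = 0, n_gpu_layers = 0, n_ubatch = 0, n_batch = 0, n_ctx = -1, n_cache_reuse = 0;
    bool flash_attn = false;
};

// Apply the defaults every FIM preset sets identically, varying only the model.
static void apply_fim_server_defaults(common_params_stub & params,
                                      const std::string & repo,
                                      const std::string & file) {
    params.hf_repo       = repo;
    params.hf_file       = file;
    params.port          = 8012;  // local fill-in-the-middle server port
    params.n_gpu_layers  = 99;    // offload all layers when a GPU backend is available
    params.flash_attn    = true;
    params.n_ubatch      = 1024;
    params.n_batch       = 1024;
    params.n_ctx         = 0;     // 0 = use the model's trained context size
    params.n_cache_reuse = 256;
}

int main() {
    common_params_stub params;
    apply_fim_server_defaults(params,
        "ggml-org/Qwen2.5-Coder-1.5B-Q8_0-GGUF", "qwen2.5-coder-1.5b-q8_0.gguf");
    return params.port == 8012 ? 0 : 1;
}

Since the presets are registered for LLAMA_EXAMPLE_SERVER, starting a local fill-in-the-middle server reduces to a single flag, e.g. llama-server --fim-qwen-1.5b-default.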