
Commit dd6495d

Merge branch 'master' into compilade/readonly-recurrent-inputs

2 parents 3129639 + f470bc3

142 files changed: +29660 -19863 lines


.github/labeler.yml
Lines changed: 7 additions & 0 deletions

@@ -86,3 +86,10 @@ nix:
 embedding:
   - changed-files:
     - any-glob-to-any-file: examples/embedding/
+
+Ascend NPU:
+  - changed-files:
+    - any-glob-to-any-file:
+      - ggml/include/ggml-cann.h
+      - ggml/src/ggml-cann/**
+      - docs/backend/CANN.md
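
The new entry follows the labeler action's existing convention: the label is applied when any changed file matches one of the `any-glob-to-any-file` patterns. As a rough local preview (the regex below is hand-derived from the globs and is not part of the commit), you can list which changed files would trigger the label:

```sh
# Approximate the "Ascend NPU" globs with an ERE over the changed files
git diff --name-only master...HEAD \
    | grep -E '^(ggml/include/ggml-cann\.h$|ggml/src/ggml-cann/|docs/backend/CANN\.md$)'
```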

.github/workflows/build-linux-cross.yml
Lines changed: 113 additions & 0 deletions

@@ -231,3 +231,116 @@ jobs:
                          -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH

           cmake --build build --config Release -j $(nproc)
+
+  debian-13-loongarch64-cpu-cross:
+    runs-on: ubuntu-24.04
+    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup LoongArch
+        run: |
+          rm -f /etc/apt/sources.list.d/*
+          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
+          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
+          EOF
+          ( echo 'quiet "true";'; \
+            echo 'APT::Get::Assume-Yes "true";'; \
+            echo 'APT::Install-Recommends "false";'; \
+            echo 'Acquire::Check-Valid-Until "false";'; \
+            echo 'Acquire::Retries "5";'; \
+          ) > /etc/apt/apt.conf.d/99snapshot-repos
+
+          apt-get update
+          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
+          dpkg --add-architecture loong64
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
+          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
+          EOF
+
+          apt-get update || true ;# Prevent failure due to missing URLs.
+
+          apt-get install -y --no-install-recommends \
+                  build-essential \
+                  gcc-14-loongarch64-linux-gnu \
+                  g++-14-loongarch64-linux-gnu
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+                         -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
+                         -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
+
+  debian-13-loongarch64-vulkan-cross:
+    runs-on: ubuntu-24.04
+    container: debian@sha256:653dfb9f86c3782e8369d5f7d29bb8faba1f4bff9025db46e807fa4c22903671
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Setup LoongArch
+        run: |
+          rm -f /etc/apt/sources.list.d/*
+          cat << EOF | tee /etc/apt/sources.list.d/debian-ports.list
+          deb http://snapshot.debian.org/archive/debian/20250515T202920Z/ trixie main
+          EOF
+          ( echo 'quiet "true";'; \
+            echo 'APT::Get::Assume-Yes "true";'; \
+            echo 'APT::Install-Recommends "false";'; \
+            echo 'Acquire::Check-Valid-Until "false";'; \
+            echo 'Acquire::Retries "5";'; \
+          ) > /etc/apt/apt.conf.d/99snapshot-repos
+
+          apt-get update
+          apt-get install -y ca-certificates debian-ports-archive-keyring cmake git zip
+          dpkg --add-architecture loong64
+
+          # Add arch-specific repositories for non-amd64 architectures
+          cat << EOF | tee /etc/apt/sources.list.d/loong64-ports.list
+          deb [arch=loong64] http://snapshot.debian.org/archive/debian-ports/20250515T194251Z/ sid main
+          EOF
+
+          apt-get update || true ;# Prevent failure due to missing URLs.
+
+          apt-get install -y --no-install-recommends \
+                  build-essential \
+                  glslc \
+                  gcc-14-loongarch64-linux-gnu \
+                  g++-14-loongarch64-linux-gnu \
+                  libvulkan-dev:loong64
+
+      - name: Build
+        run: |
+          cmake -B build -DLLAMA_CURL=OFF \
+                         -DCMAKE_BUILD_TYPE=Release \
+                         -DGGML_VULKAN=ON \
+                         -DGGML_OPENMP=OFF \
+                         -DLLAMA_BUILD_EXAMPLES=ON \
+                         -DLLAMA_BUILD_TOOLS=ON \
+                         -DLLAMA_BUILD_TESTS=OFF \
+                         -DCMAKE_SYSTEM_NAME=Linux \
+                         -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
+                         -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
+                         -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
+                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
+                         -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+                         -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
+
+          cmake --build build --config Release -j $(nproc)
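
Both new jobs pin their apt sources to snapshot.debian.org, so the cross toolchain is reproducible outside CI as well. The `CMAKE_FIND_ROOT_PATH_MODE_*` settings are what make this a proper cross build: build-time programs come from the host (`NEVER`), libraries resolve only under the LoongArch root (`ONLY`), and headers may come from both. A condensed local sketch, assuming the same Debian container and the `gcc-14-loongarch64-linux-gnu` toolchain installed as above (the remaining workflow flags are omitted for brevity and may still be needed in practice):

```sh
cmake -B build -DLLAMA_CURL=OFF \
    -DCMAKE_SYSTEM_NAME=Linux \
    -DCMAKE_SYSTEM_PROCESSOR=loongarch64 \
    -DCMAKE_C_COMPILER=loongarch64-linux-gnu-gcc-14 \
    -DCMAKE_CXX_COMPILER=loongarch64-linux-gnu-g++-14 \
    -DCMAKE_FIND_ROOT_PATH=/usr/lib/loongarch64-linux-gnu \
    -DCMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
    -DCMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
    -DCMAKE_FIND_ROOT_PATH_MODE_INCLUDE=BOTH
cmake --build build --config Release -j $(nproc)
```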

.github/workflows/build.yml
Lines changed: 4 additions & 4 deletions

@@ -839,12 +839,12 @@ jobs:
             -DGGML_CUDA=ON
           cmake --build build

-  windows-2019-cmake-cuda:
-    runs-on: windows-2019
+  windows-2022-cmake-cuda:
+    runs-on: windows-2022

     strategy:
       matrix:
-        cuda: ['12.4', '11.7']
+        cuda: ['12.4']

     steps:
       - name: Clone
@@ -878,7 +878,7 @@ jobs:
         env:
           CURL_PATH: ${{ steps.get_libcurl.outputs.curl_path }}
         run: |
-          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
           cmake -S . -B build -G "Ninja Multi-Config" ^
             -DLLAMA_BUILD_SERVER=ON ^
             -DGGML_NATIVE=OFF ^

.github/workflows/release.yml
Lines changed: 12 additions & 5 deletions

@@ -131,8 +131,9 @@ jobs:
         include:
           - build: 'x64'
             os: ubuntu-22.04
-          - build: 'arm64'
-            os: ubuntu-22.04-arm
+          # GGML_BACKEND_DL and GGML_CPU_ALL_VARIANTS are not currently supported on arm
+          # - build: 'arm64'
+          #   os: ubuntu-22.04-arm

     runs-on: ${{ matrix.os }}

@@ -159,6 +160,9 @@ jobs:
         id: cmake_build
         run: |
           cmake -B build \
+              -DGGML_BACKEND_DL=ON \
+              -DGGML_NATIVE=OFF \
+              -DGGML_CPU_ALL_VARIANTS=ON \
              -DLLAMA_FATAL_WARNINGS=ON \
              ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(nproc)
@@ -207,6 +211,9 @@ jobs:
         id: cmake_build
         run: |
           cmake -B build \
+              -DGGML_BACKEND_DL=ON \
+              -DGGML_NATIVE=OFF \
+              -DGGML_CPU_ALL_VARIANTS=ON \
              -DGGML_VULKAN=ON \
              ${{ env.CMAKE_ARGS }}
           cmake --build build --config Release -j $(nproc)
@@ -373,11 +380,11 @@ jobs:
           name: llama-bin-win-${{ matrix.backend }}-${{ matrix.arch }}.zip

   windows-cuda:
-    runs-on: windows-2019
+    runs-on: windows-2022

     strategy:
       matrix:
-        cuda: ['12.4', '11.7']
+        cuda: ['12.4']

     steps:
       - name: Clone
@@ -405,7 +412,7 @@ jobs:
         id: cmake_build
         shell: cmd
         run: |
-          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvarsall.bat" x64
           cmake -S . -B build -G "Ninja Multi-Config" ^
             -DGGML_BACKEND_DL=ON ^
             -DGGML_NATIVE=OFF ^
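
The three new flags change how the Linux release binaries are built: `GGML_BACKEND_DL=ON` compiles each backend as a dynamically loadable module, `GGML_NATIVE=OFF` stops tuning for the build machine, and `GGML_CPU_ALL_VARIANTS=ON` emits one CPU module per instruction-set level so the best match is picked at runtime (which is also why the arm leg is commented out above). A local sketch of the same configuration; the exact module filenames are an assumption:

```sh
cmake -B build \
    -DGGML_BACKEND_DL=ON \
    -DGGML_NATIVE=OFF \
    -DGGML_CPU_ALL_VARIANTS=ON
cmake --build build --config Release -j $(nproc)
# Expect one CPU backend module per variant, e.g. libggml-cpu-haswell.so
ls build/bin | grep -i ggml-cpu
```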

.github/workflows/server.yml
Lines changed: 1 addition & 1 deletion

@@ -180,7 +180,7 @@ jobs:


   server-windows:
-    runs-on: windows-2019
+    runs-on: windows-2022

     steps:
       - name: Clone

CMakeLists.txt
Lines changed: 5 additions & 0 deletions

@@ -159,6 +159,11 @@ if (NOT TARGET ggml AND NOT LLAMA_USE_SYSTEM_GGML)
     # ... otherwise assume ggml is added by a parent CMakeLists.txt
 endif()

+if (MINGW)
+    # Target Windows 8 for PrefetchVirtualMemory
+    add_compile_definitions(_WIN32_WINNT=${GGML_WIN_VER})
+endif()
+
 #
 # build the library
 #
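
`PrefetchVirtualMemory` was introduced in Windows 8, and the Win32 headers only declare it when `_WIN32_WINNT` is at least the Windows 8 value; this block extends the definition that MSVC builds already get from ggml to MinGW, reusing ggml's `GGML_WIN_VER` cache variable. A sketch of a MinGW cross build, assuming the usual mingw-w64 toolchain and that `GGML_WIN_VER` keeps a Windows 8 default of `0x602` (that default value is an assumption):

```sh
cmake -B build \
    -DCMAKE_SYSTEM_NAME=Windows \
    -DCMAKE_C_COMPILER=x86_64-w64-mingw32-gcc \
    -DCMAKE_CXX_COMPILER=x86_64-w64-mingw32-g++ \
    -DGGML_WIN_VER=0x602    # explicit here; normally inherited from ggml
cmake --build build --config Release
```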

Makefile
Lines changed: 2 additions & 2 deletions

@@ -367,7 +367,7 @@ ifdef LLAMA_SERVER_SSL
 endif

 ifndef GGML_NO_CPU_AARCH64
-	MK_CPPFLAGS += -DGGML_USE_CPU_AARCH64
+	MK_CPPFLAGS += -DGGML_USE_CPU_REPACK
 endif

 # warnings
@@ -970,7 +970,7 @@ OBJ_GGML = \
 	$(DIR_GGML)/src/ggml-threading.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu_cpp.o \
-	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-aarch64.o \
+	$(DIR_GGML)/src/ggml-cpu/repack.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-hbm.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-quants.o \
 	$(DIR_GGML)/src/ggml-cpu/ggml-cpu-traits.o \
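
This tracks the upstream ggml rename of the CPU "aarch64" path (runtime repacking of weights into blocked layouts) to the architecture-neutral name "repack": only the macro and the object file change, while the `GGML_NO_CPU_AARCH64` opt-out keeps its old spelling. If the legacy Makefile build still works in your checkout (it is deprecated; CMake is the supported route), usage is unchanged:

```sh
# Build without the repack path; note the guard name predates the rename
make GGML_NO_CPU_AARCH64=1 llama-cli -j $(nproc)
```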

README.md
Lines changed: 32 additions & 11 deletions

@@ -3,6 +3,7 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+[![Release](https://img.shields.io/github/v/release/ggml-org/llama.cpp)](https://github.com/ggml-org/llama.cpp/releases)
 [![Server](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml/badge.svg)](https://github.com/ggml-org/llama.cpp/actions/workflows/server.yml)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggml-org/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggml-org/llama.cpp/discussions/205) / [ggml](https://github.com/ggml-org/ggml)
@@ -28,6 +29,30 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ----

+## Quick start
+
+Getting started with llama.cpp is straightforward. Here are several ways to install it on your machine:
+
+- Install `llama.cpp` using [brew, nix or winget](docs/install.md)
+- Run with Docker - see our [Docker documentation](docs/docker.md)
+- Download pre-built binaries from the [releases page](https://github.com/ggml-org/llama.cpp/releases)
+- Build from source by cloning this repository - check out [our build guide](docs/build.md)
+
+Once installed, you'll need a model to work with. Head to the [Obtaining and quantizing models](#obtaining-and-quantizing-models) section to learn more.
+
+Example command:
+
+```sh
+# Use a local model file
+llama-cli -m my_model.gguf
+
+# Or download and run a model directly from Hugging Face
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+
+# Launch OpenAI-compatible API server
+llama-server -hf ggml-org/gemma-3-1b-it-GGUF
+```
+
 ## Description

 The main goal of `llama.cpp` is to enable LLM inference with minimal setup and state-of-the-art performance on a wide
@@ -130,6 +155,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 <details>
 <summary>Bindings</summary>

+- Python: [ddh0/easy-llama](https://github.com/ddh0/easy-llama)
 - Python: [abetlen/llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
 - Go: [go-skynet/go-llama.cpp](https://github.com/go-skynet/go-llama.cpp)
 - Node.js: [withcatai/node-llama-cpp](https://github.com/withcatai/node-llama-cpp)
@@ -229,6 +255,7 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo

 </details>

+
 ## Supported backends

 | Backend | Target devices |
@@ -245,24 +272,18 @@ Instructions for adding support for new models: [HOWTO-add-model.md](docs/develo
 | [OpenCL](docs/backend/OPENCL.md) | Adreno GPU |
 | [RPC](https://github.com/ggml-org/llama.cpp/tree/master/tools/rpc) | All |

-## Building the project
-
-The main product of this project is the `llama` library. Its C-style interface can be found in [include/llama.h](include/llama.h).
-The project also includes many example programs and tools using the `llama` library. The examples range from simple, minimal code snippets to sophisticated sub-projects such as an OpenAI-compatible HTTP server. Possible methods for obtaining the binaries:
-
-- Clone this repository and build locally, see [how to build](docs/build.md)
-- On MacOS or Linux, install `llama.cpp` via [brew, flox or nix](docs/install.md)
-- Use a Docker image, see [documentation for Docker](docs/docker.md)
-- Download pre-built binaries from [releases](https://github.com/ggml-org/llama.cpp/releases)
-
 ## Obtaining and quantizing models

 The [Hugging Face](https://huggingface.co) platform hosts a [number of LLMs](https://huggingface.co/models?library=gguf&sort=trending) compatible with `llama.cpp`:

 - [Trending](https://huggingface.co/models?library=gguf&sort=trending)
 - [LLaMA](https://huggingface.co/models?sort=trending&search=llama+gguf)

-You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`.
+You can either manually download the GGUF file or directly use any `llama.cpp`-compatible models from [Hugging Face](https://huggingface.co/) or other model hosting sites, such as [ModelScope](https://modelscope.cn/), by using this CLI argument: `-hf <user>/<model>[:quant]`. For example:
+
+```sh
+llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
+```

 By default, the CLI would download from Hugging Face, you can switch to other options with the environment variable `MODEL_ENDPOINT`. For example, you may opt to downloading model checkpoints from ModelScope or other model sharing communities by setting the environment variable, e.g. `MODEL_ENDPOINT=https://www.modelscope.cn/`.
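
The `MODEL_ENDPOINT` note above composes with the new Quick start examples; for instance (whether this particular repo is mirrored on ModelScope is an assumption):

```sh
# Download through ModelScope instead of the default Hugging Face endpoint
MODEL_ENDPOINT=https://www.modelscope.cn/ llama-cli -hf ggml-org/gemma-3-1b-it-GGUF
```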

ci/run.sh
Lines changed: 14 additions & 1 deletion

@@ -46,7 +46,20 @@ if [ ! -z ${GG_BUILD_METAL} ]; then
 fi

 if [ ! -z ${GG_BUILD_CUDA} ]; then
-    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=native"
+    CMAKE_EXTRA="${CMAKE_EXTRA} -DGGML_CUDA=ON"
+
+    if command -v nvidia-smi >/dev/null 2>&1; then
+        CUDA_ARCH=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits 2>/dev/null | head -1 | tr -d '.')
+        if [[ -n "$CUDA_ARCH" && "$CUDA_ARCH" =~ ^[0-9]+$ ]]; then
+            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=${CUDA_ARCH}"
+        else
+            echo "Warning: Using fallback CUDA architectures"
+            CMAKE_EXTRA="${CMAKE_EXTRA} -DCMAKE_CUDA_ARCHITECTURES=61;70;75;80;86;89"
+        fi
+    else
+        echo "Error: nvidia-smi not found, cannot build with CUDA"
+        exit 1
+    fi
 fi

 if [ ! -z ${GG_BUILD_SYCL} ]; then
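
Replacing `-DCMAKE_CUDA_ARCHITECTURES=native` with a value queried from `nvidia-smi` gives CI a deterministic architecture list and a clear failure when no GPU is present. The transformation is easy to verify by hand on any machine with `nvidia-smi`:

```sh
# Prints e.g. "8.6" for the first GPU ...
nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits | head -1
# ... and stripping the dot yields the value passed to CMake, e.g. 86
nvidia-smi --query-gpu=compute_cap --format=csv,noheader,nounits | head -1 | tr -d '.'
```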

common/arg.cpp
Lines changed: 3 additions & 2 deletions

@@ -1348,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ));
     add_opt(common_arg(
         {"--prio"}, "N",
-        string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority),
+        string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority),
         [](common_params & params, int prio) {
-            if (prio < 0 || prio > 3) {
+            if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) {
                 throw std::invalid_argument("invalid value");
             }
             params.cpuparams.priority = (enum ggml_sched_priority) prio;
@@ -2869,6 +2869,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             "(default: deepseek)",
             [](common_params & params, const std::string & value) {
                 /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
+                else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
                 else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
                 else { throw std::invalid_argument("invalid value"); }
             }
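
Both hunks are user-visible on the command line: `--prio` now accepts the full `ggml_sched_priority` range, including the low priority (-1) that the old `0..3` check rejected, and `--reasoning-format` gains a `deepseek-legacy` value. Illustrative invocations (the model path is a placeholder):

```sh
# Run at low scheduling priority, newly allowed by the widened range check
llama-server -m my_model.gguf --prio -1

# Opt in to the legacy DeepSeek reasoning format added in the second hunk
llama-cli -m my_model.gguf --reasoning-format deepseek-legacy
```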
