
Commit 0959cc1

Merge branch 'master' into xsn/vision_2
2 parents b72d755 + 466ea66

21 files changed, 546 additions and 200 deletions

.github/workflows/build.yml

Lines changed: 86 additions & 49 deletions

@@ -56,6 +56,7 @@ jobs:
           mkdir build
           cd build
           cmake .. \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
             -DLLAMA_FATAL_WARNINGS=ON \
             -DLLAMA_CURL=ON \
             -DGGML_METAL_USE_BF16=ON \

@@ -120,6 +121,7 @@ jobs:
           # Metal is disabled due to intermittent failures with Github runners not having a GPU:
           # https://github.com/ggerganov/llama.cpp/actions/runs/8635935781/job/23674807267#step:5:2313
           cmake -B build \
+            -DCMAKE_BUILD_RPATH="@loader_path" \
             -DLLAMA_FATAL_WARNINGS=ON \
             -DLLAMA_CURL=ON \
             -DGGML_METAL=OFF \
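
The new `-DCMAKE_BUILD_RPATH="@loader_path"` flag embeds a relative rpath in the macOS binaries, so once shared libraries are emitted next to the executables (see the `CMakeLists.txt` change further down) the packaged zips can resolve their dylibs from their own directory. A quick way to confirm the rpath made it into a locally built binary — a sketch assuming a `build/bin/llama-cli` produced by the commands above, not something this commit adds:

```bash
# List the LC_RPATH load commands of the built binary (macOS only);
# a "path @loader_path" entry should appear after this change.
otool -l build/bin/llama-cli | grep -A2 LC_RPATH
```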
@@ -160,8 +162,8 @@ jobs:
           path: llama-${{ steps.tag.outputs.name }}-bin-macos-x64.zip
           name: llama-bin-macos-x64.zip

-  ubuntu-latest-cmake:
-    runs-on: ubuntu-latest
+  ubuntu-cpu-cmake:
+    runs-on: ubuntu-22.04

     steps:
       - name: Clone

@@ -181,7 +183,10 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_CURL=ON -DGGML_RPC=ON
+          cmake .. \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_CURL=ON \
+            -DGGML_RPC=ON
           cmake --build . --config Release -j $(nproc)

       - name: Test

@@ -256,7 +261,10 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
+          cmake .. \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }}
           cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

       - name: Build (no OpenMP)

@@ -265,7 +273,11 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} -DGGML_OPENMP=OFF
+          cmake .. \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -DGGML_OPENMP=OFF
           cmake --build . --config ${{ matrix.build_type }} -j $(nproc)

       - name: Test
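
The sanitizer job passes the same flags as before, just one per line. To reproduce a single matrix entry locally, the workflow's `${{ matrix.* }}` placeholders have to be resolved by hand — a sketch assuming `sanitizer=ADDRESS` and `build_type=Debug`, which are illustrative values rather than ones taken from this diff:

```bash
mkdir build && cd build
# One concrete instantiation of the sanitizer matrix above.
cmake .. \
  -DLLAMA_FATAL_WARNINGS=ON \
  -DLLAMA_SANITIZE_ADDRESS=ON \
  -DCMAKE_BUILD_TYPE=Debug
cmake --build . --config Debug -j $(nproc)
```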
@@ -295,7 +307,8 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake -DGGML_RPC=ON ..
+          cmake .. \
+            -DGGML_RPC=ON
           cmake --build . --config Release -j $(nproc)

       - name: Test

@@ -325,7 +338,8 @@ jobs:
         run: |
           mkdir build
           cd build
-          cmake -DGGML_VULKAN=ON ..
+          cmake .. \
+            -DGGML_VULKAN=ON
           cmake --build . --config Release -j $(nproc)

       - name: Test

@@ -352,13 +366,18 @@ jobs:
       - name: Build with native CMake HIP support
         id: cmake_build
         run: |
-          cmake -B build -S . -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" -DGGML_HIP=ON
+          cmake -B build -S . \
+            -DCMAKE_HIP_COMPILER="$(hipconfig -l)/clang" \
+            -DGGML_HIP=ON
           cmake --build build --config Release -j $(nproc)

       - name: Build with legacy HIP support
         id: cmake_build_legacy_hip
         run: |
-          cmake -B build2 -S . -DCMAKE_C_COMPILER=hipcc -DCMAKE_CXX_COMPILER=hipcc -DGGML_HIP=ON
+          cmake -B build2 -S . \
+            -DCMAKE_C_COMPILER=hipcc \
+            -DCMAKE_CXX_COMPILER=hipcc \
+            -DGGML_HIP=ON
           cmake --build build2 --config Release -j $(nproc)

   ubuntu-22-cmake-musa:
@@ -379,7 +398,8 @@ jobs:
       - name: Build with native CMake MUSA support
         id: cmake_build
         run: |
-          cmake -B build -S . -DGGML_MUSA=ON
+          cmake -B build -S . \
+            -DGGML_MUSA=ON
           cmake --build build --config Release -j $(nproc)

   ubuntu-22-cmake-sycl:

@@ -420,7 +440,10 @@ jobs:
           source /opt/intel/oneapi/setvars.sh
           mkdir build
           cd build
-          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
+          cmake .. \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx
           cmake --build . --config Release -j $(nproc)

   ubuntu-22-cmake-sycl-fp16:
@@ -461,42 +484,13 @@ jobs:
           source /opt/intel/oneapi/setvars.sh
           mkdir build
           cd build
-          cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON ..
+          cmake .. \
+            -DGGML_SYCL=ON \
+            -DCMAKE_C_COMPILER=icx \
+            -DCMAKE_CXX_COMPILER=icpx \
+            -DGGML_SYCL_F16=ON
           cmake --build . --config Release -j $(nproc)

-  # TODO: build with GGML_METAL=OFF because test-backend-ops fail on "Apple Paravirtual device" and I don't know
-  # how to debug it.
-  # ref: https://github.com/ggerganov/llama.cpp/actions/runs/7132125951/job/19422043567?pr=4359#step:5:6584
-  # would be great if we fix these
-  macOS-latest-cmake:
-    runs-on: macos-latest
-
-    steps:
-      - name: Clone
-        id: checkout
-        uses: actions/checkout@v4
-
-      - name: Dependencies
-        id: depends
-        continue-on-error: true
-        run: |
-          brew update
-
-      - name: Build
-        id: cmake_build
-        run: |
-          sysctl -a
-          mkdir build
-          cd build
-          cmake -DLLAMA_FATAL_WARNINGS=ON -DGGML_METAL=OFF ..
-          cmake --build . --config Release -j $(sysctl -n hw.logicalcpu)
-
-      - name: Test
-        id: cmake_test
-        run: |
-          cd build
-          ctest -L main --verbose --timeout 900
-
   macOS-latest-cmake-ios:
     runs-on: macos-latest

@@ -827,7 +821,13 @@ jobs:

       - name: Build with CMake
         run: |
-          cmake -S . -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGGML_NATIVE=OFF -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=89-real -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined -DLLAMA_FATAL_WARNINGS=ON
+          cmake -S . -B build -G Ninja \
+            -DCMAKE_BUILD_TYPE=Release \
+            -DCMAKE_CUDA_ARCHITECTURES=89-real \
+            -DCMAKE_EXE_LINKER_FLAGS=-Wl,--allow-shlib-undefined \
+            -DLLAMA_FATAL_WARNINGS=ON \
+            -DGGML_NATIVE=OFF \
+            -DGGML_CUDA=ON
           cmake --build build

   windows-2019-cmake-cuda:
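
`-DCMAKE_CUDA_ARCHITECTURES=89-real` restricts the CUDA build to SM 8.9 (Ada Lovelace, e.g. RTX 4090 / L40) device code, which shortens compile times but produces binaries that only run on that architecture. When adapting the command locally, the compute capability of the installed GPU can be checked first — a sketch assuming a reasonably recent `nvidia-smi`:

```bash
# Print the compute capability of each visible GPU, e.g. "8.9" for Ada-class cards.
nvidia-smi --query-gpu=name,compute_cap --format=csv
```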
@@ -916,7 +916,11 @@ jobs:
         shell: cmd
         run: |
           call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
-          cmake -S . -B build -G "Ninja Multi-Config" -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DGGML_RPC=ON
+          cmake -S . -B build -G "Ninja Multi-Config" \
+            -DLLAMA_BUILD_SERVER=ON \
+            -DGGML_NATIVE=OFF \
+            -DGGML_CUDA=ON \
+            -DGGML_RPC=ON
           set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
           cmake --build build --config Release -j %NINJA_JOBS% -t ggml
           cmake --build build --config Release

@@ -1201,8 +1205,7 @@ jobs:
     runs-on: ubuntu-latest

     needs:
-      - ubuntu-latest-cmake
-      - macOS-latest-cmake
+      - ubuntu-cpu-cmake
       - windows-latest-cmake
       - windows-2019-cmake-cuda
       - windows-latest-cmake-hip-release
@@ -1461,3 +1464,37 @@ jobs:
   #     popd
   #     emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
   #     make
+
+  openEuler-latest-cmake-cann:
+    if: ${{ github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'Ascend NPU') }}
+    defaults:
+      run:
+        shell: bash -el {0}
+    runs-on: ubuntu-24.04-arm
+    strategy:
+      matrix:
+        cann:
+          - '8.0.rc3.beta1-910b-openeuler22.03-py3.10'
+        device:
+          - 'ascend910b3'
+        build:
+          - 'Release'
+    container: ascendai/cann:${{ matrix.cann }}
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Dependencies
+        run: |
+          yum update -y
+          yum install -y git gcc gcc-c++ make cmake
+
+      - name: Build
+        run: |
+          export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}
+
+          cmake -S . -B build \
+            -DCMAKE_BUILD_TYPE=${{ matrix.build }} \
+            -DGGML_CANN=on \
+            -DSOC_TYPE=${{ matrix.device }}
+          cmake --build build -j $(nproc)
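
The new `openEuler-latest-cmake-cann` job builds the Ascend NPU (CANN) backend inside the `ascendai/cann` container. Resolving the matrix values gives roughly the following local build — a sketch that assumes the Ascend toolkit environment (`ASCEND_TOOLKIT_HOME`) is already set up, as it is in that image:

```bash
export LD_LIBRARY_PATH=${ASCEND_TOOLKIT_HOME}/lib64:${ASCEND_TOOLKIT_HOME}/$(uname -m)-linux/devlib/:${LD_LIBRARY_PATH}

# Matrix values resolved by hand: build=Release, device=ascend910b3.
cmake -S . -B build \
  -DCMAKE_BUILD_TYPE=Release \
  -DGGML_CANN=on \
  -DSOC_TYPE=ascend910b3
cmake --build build -j $(nproc)
```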

CMakeLists.txt

Lines changed: 1 addition & 0 deletions

@@ -16,6 +16,7 @@ endif()
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)

 if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     set(LLAMA_STANDALONE ON)
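
With `CMAKE_LIBRARY_OUTPUT_DIRECTORY` now matching `CMAKE_RUNTIME_OUTPUT_DIRECTORY`, shared libraries land in `build/bin/` next to the executables, which is what makes the `@loader_path` rpath added in the workflow sufficient. A quick sanity check — a sketch using the standard `BUILD_SHARED_LIBS` CMake option, not a flag introduced by this commit:

```bash
cmake -B build -DBUILD_SHARED_LIBS=ON
cmake --build build -j $(nproc)
# Executables and the llama/ggml shared libraries should now sit side by side.
ls build/bin
```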

docs/backend/SYCL.md

Lines changed: 1 addition & 1 deletion

@@ -133,7 +133,7 @@ The docker build option is currently limited to *intel GPU* targets.
 ### Build image
 ```sh
 # Using FP16
-docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" -f .devops/llama-cli-intel.Dockerfile .
+docker build -t llama-cpp-sycl --build-arg="GGML_SYCL_F16=ON" --target light -f .devops/intel.Dockerfile .
 ```

 *Notes*:

docs/build.md

Lines changed: 1 addition & 1 deletion

@@ -286,7 +286,7 @@ You don't need to install Vulkan SDK. It will be installed inside the container.

 ```sh
 # Build the image
-docker build -t llama-cpp-vulkan -f .devops/llama-cli-vulkan.Dockerfile .
+docker build -t llama-cpp-vulkan --target light -f .devops/vulkan.Dockerfile .

 # Then, use it:
 docker run -it --rm -v "$(pwd):/app:Z" --device /dev/dri/renderD128:/dev/dri/renderD128 --device /dev/dri/card1:/dev/dri/card1 llama-cpp-vulkan -m "/app/models/YOUR_MODEL_FILE" -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33

docs/docker.md

Lines changed: 6 additions & 6 deletions

@@ -60,9 +60,9 @@ Assuming one has the [nvidia-container-toolkit](https://github.com/NVIDIA/nvidia
 ## Building Docker locally

 ```bash
-docker build -t local/llama.cpp:full-cuda -f .devops/full-cuda.Dockerfile .
-docker build -t local/llama.cpp:light-cuda -f .devops/llama-cli-cuda.Dockerfile .
-docker build -t local/llama.cpp:server-cuda -f .devops/llama-server-cuda.Dockerfile .
+docker build -t local/llama.cpp:full-cuda --target full -f .devops/cuda.Dockerfile .
+docker build -t local/llama.cpp:light-cuda --target light -f .devops/cuda.Dockerfile .
+docker build -t local/llama.cpp:server-cuda --target server -f .devops/cuda.Dockerfile .
 ```

 You may want to pass in some different `ARGS`, depending on the CUDA environment supported by your container host, as well as the GPU architecture.

@@ -95,9 +95,9 @@ Assuming one has the [mt-container-toolkit](https://developer.mthreads.com/musa/
 ## Building Docker locally

 ```bash
-docker build -t local/llama.cpp:full-musa -f .devops/full-musa.Dockerfile .
-docker build -t local/llama.cpp:light-musa -f .devops/llama-cli-musa.Dockerfile .
-docker build -t local/llama.cpp:server-musa -f .devops/llama-server-musa.Dockerfile .
+docker build -t local/llama.cpp:full-musa --target full -f .devops/musa.Dockerfile .
+docker build -t local/llama.cpp:light-musa --target light -f .devops/musa.Dockerfile .
+docker build -t local/llama.cpp:server-musa --target server -f .devops/musa.Dockerfile .
 ```

 You may want to pass in some different `ARGS`, depending on the MUSA environment supported by your container host, as well as the GPU architecture.
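
The consolidated `.devops/cuda.Dockerfile` and `.devops/musa.Dockerfile` are multi-stage builds, so `--target full|light|server` selects which stage ends up in the tagged image. A hypothetical run of the locally built `light` image (the model path, prompt, and flag values are placeholders; the flags themselves are the usual llama-cli options, mirroring the Vulkan example above):

```bash
# Run the light (llama-cli) image against a local GGUF model, offloading all layers to the GPU.
docker run --rm --gpus all -v "$(pwd)/models:/models" \
  local/llama.cpp:light-cuda \
  -m /models/YOUR_MODEL_FILE.gguf -p "Building a website can be done in 10 simple steps:" -n 128 -ngl 99
```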

examples/run/README.md

Lines changed: 2 additions & 3 deletions

@@ -3,11 +3,10 @@
 The purpose of this example is to demonstrate a minimal usage of llama.cpp for running models.

 ```bash
-llama-run granite-code
+llama-run granite3-moe
 ```

 ```bash
-llama-run -h
 Description:
   Runs a llm

@@ -17,7 +16,7 @@ Usage:
 Options:
   -c, --context-size <value>
       Context size (default: 2048)
-  -n, --ngl <value>
+  -n, -ngl, --ngl <value>
       Number of GPU layers (default: 0)
   --temp <value>
       Temperature (default: 0.8)
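
For reference, the documented options compose in the usual way; a hypothetical invocation, assuming options come before the model name and with values chosen purely for illustration:

```bash
# Larger context, all layers offloaded to the GPU, lower sampling temperature.
llama-run -c 4096 --ngl 99 --temp 0.2 granite3-moe "Explain what an RPATH is in one paragraph."
```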

examples/server/public/index.html.gz

658 Bytes
Binary file not shown.

examples/server/webui/index.html

Lines changed: 26 additions & 1 deletion

@@ -141,6 +141,7 @@ <h2 class="font-bold ml-4">Conversations</h2>
             :msg="pendingMsg"
             :key="pendingMsg.id"
             :is-generating="isGenerating"
+            :show-thought-in-progress="config.showThoughtInProgress"
             :edit-user-msg-and-regenerate="() => {}"
             :regenerate-msg="() => {}"></message-bubble>
         </div>

@@ -202,6 +203,20 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
             </template>
           </div>
         </details>
+        <!-- Section: Reasoning models -->
+        <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
+          <summary class="collapse-title font-bold">Reasoning models</summary>
+          <div class="collapse-content">
+            <div class="flex flex-row items-center mb-2">
+              <input type="checkbox" class="checkbox" v-model="config.showThoughtInProgress" />
+              <span class="ml-4">Expand though process by default for generating message</span>
+            </div>
+            <div class="flex flex-row items-center mb-2">
+              <input type="checkbox" class="checkbox" v-model="config.excludeThoughtOnReq" />
+              <span class="ml-4">Exclude thought process when sending request to API (Recommended for DeepSeek-R1)</span>
+            </div>
+          </div>
+        </details>
         <!-- Section: Advanced config -->
         <details class="collapse collapse-arrow bg-base-200 mb-2 overflow-visible">
           <summary class="collapse-title font-bold">Advanced config</summary>

@@ -261,7 +276,17 @@ <h3 class="text-lg font-bold mb-6">Settings</h3>
           <span v-if="msg.content === null" class="loading loading-dots loading-md"></span>
           <!-- render message as markdown -->
           <div v-else dir="auto">
-            <vue-markdown :source="msg.content"></vue-markdown>
+            <details v-if="msg.role === 'assistant' && splitMsgContent.cot" class="collapse bg-base-200 collapse-arrow mb-4" :open="splitMsgContent.isThinking && showThoughtInProgress">
+              <summary class="collapse-title">
+                <span v-if="splitMsgContent.isThinking">
+                  <span v-if="isGenerating" class="loading loading-spinner loading-md mr-2" style="vertical-align: middle;"></span>
+                  <b>Thinking</b>
+                </span>
+                <b v-else>Thought Process</b>
+              </summary>
+              <vue-markdown :source="splitMsgContent.cot" dir="auto" class="collapse-content"></vue-markdown>
+            </details>
+            <vue-markdown :source="splitMsgContent.content"></vue-markdown>
           </div>
           <!-- render timings if enabled -->
           <div class="dropdown dropdown-hover dropdown-top mt-2" v-if="timings && config.showTokensPerSecond">
